VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 97161

Last change on this file since 97161 was 97161, checked in by vboxsync, 2 years ago

VMM/IEM: Implementation for fpatan instruction, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 642.6 KB
Line 
1/* $Id: IEMAllAImplC.cpp 97161 2022-10-14 19:25:34Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2022 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * If the build has not already defined it, it is defined here when targeting
 * ARM32 or ARM64, or when Doxygen is running.  The C implementations further
 * down in this file test for it in their \#if guards.
 */
#if !defined(IEM_WITHOUT_ASSEMBLY)
# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
# define IEM_WITHOUT_ASSEMBLY
# endif
#endif
/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes,
   i.e. it takes IEM_WITHOUT_ASSEMBLY away again even if it was set above. */
#ifdef IEM_WITH_ASSEMBLY
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the signed flag value given a result and its bit width.
 *
 * The signed flag (SF) is a duplication of the most significant bit in the
 * result.  The result is shifted right by (a_cBitsWidth - X86_EFL_SF_BIT - 1)
 * so that bit (a_cBitsWidth - 1) ends up in the X86_EFL_SF_BIT position, and
 * the shifted value is then masked with X86_EFL_SF.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 *
 * @note    The shift count is computed from a_cBitsWidth, so widths smaller
 *          than X86_EFL_SF_BIT + 1 would yield a negative shift count.
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
73
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) indicates whether the result is zero or not.  The
 * comparison against zero yields 0 or 1, which is then shifted up to the
 * X86_EFL_ZF_BIT position.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * These are typically used by concatenating the macro name with a bit count
 * (X86_EFL_GET_OF_ ## width).  The input is expected to carry the overflow
 * indication in its most significant bit (bit width - 1); each variant moves
 * that bit to the X86_EFL_OF_BIT position and masks with X86_EFL_OF.
 *
 * The problem is that 8-bit values need shifting in the other direction
 * (left) than the others (right).
 *
 * @returns X86_EFL_OF or 0.
 * @param   a_uValue        The value holding the overflow indicator in its
 *                          top bit.
 */
#define X86_EFL_GET_OF_8(a_uValue) (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_16(a_uValue) ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_32(a_uValue) ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_64(a_uValue) ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
 *
 * This is a statement macro (do { } while (0)); it does not yield a value.
 * The current flags are read from @a a_pfEFlags, all X86_EFL_STATUS_BITS are
 * cleared, the newly calculated status bits are OR'ed in and the result is
 * written back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be
 *                          given as a plain literal because it is token-pasted
 *                          into X86_EFL_GET_OF_xx and uintxx_t.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 *
 * @note    The arguments are evaluated more than once, so they must be free of
 *          side effects.  The macro declares a local named fEflTmp, so the
 *          argument expressions must not refer to a caller variable of that
 *          name.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        /* CF: caller supplied expression. */ \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        /* PF: table lookup on the low 8 bits of the result. */ \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        /* AF: result ^ src ^ dst, masked with X86_EFL_AF. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same signed \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the signed bit value must differ between \
           the two inputs and the result's signed bit diff from the first input. \
           Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth( ( ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                     & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand is
 * undefined.  We do not set AF, as that seems to make the most sense (which
 * probably makes it the most wrong in real life).
 *
 * This is a statement macro (do { } while (0)); it does not yield a value.
 * All X86_EFL_STATUS_BITS are cleared first, then PF, ZF and SF are calculated
 * from the result, @a a_fExtra is OR'ed in, and the new value is written back
 * through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 *
 * @note    @a a_uResult is evaluated more than once, so it must be free of
 *          side effects.  The macro declares a local named fEflTmp, so the
 *          argument expressions must not refer to a caller variable of that
 *          name.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
/**
 * Parity calculation table.
 *
 * Indexed by a byte value; each entry is X86_EFL_PF when that byte has an even
 * number of set bits and 0 when the count is odd.  The status flag macros in
 * this file index it with the low 8 bits of a result.
 *
 * This is also used by iemAllAImpl.asm.
 *
 * The generator code:
 * @code
 * #include <stdio.h>
 *
 * int main()
 * {
 *     unsigned b;
 *     for (b = 0; b < 256; b++)
 *     {
 *         int cOnes = ( b & 1)
 *                   + ((b >> 1) & 1)
 *                   + ((b >> 2) & 1)
 *                   + ((b >> 3) & 1)
 *                   + ((b >> 4) & 1)
 *                   + ((b >> 5) & 1)
 *                   + ((b >> 6) & 1)
 *                   + ((b >> 7) & 1);
 *         printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
 *                b,
 *                (b >> 7) & 1,
 *                (b >> 6) & 1,
 *                (b >> 5) & 1,
 *                (b >> 4) & 1,
 *                (b >> 3) & 1,
 *                (b >> 2) & 1,
 *                (b >> 1) & 1,
 *                b & 1,
 *                cOnes & 1 ? "0" : "X86_EFL_PF");
 *     }
 *     return 0;
 * }
 * @endcode
 */
uint8_t const g_afParity[256] =
{
    /* 0000 = 00000000b */ X86_EFL_PF,
    /* 0x01 = 00000001b */ 0,
    /* 0x02 = 00000010b */ 0,
    /* 0x03 = 00000011b */ X86_EFL_PF,
    /* 0x04 = 00000100b */ 0,
    /* 0x05 = 00000101b */ X86_EFL_PF,
    /* 0x06 = 00000110b */ X86_EFL_PF,
    /* 0x07 = 00000111b */ 0,
    /* 0x08 = 00001000b */ 0,
    /* 0x09 = 00001001b */ X86_EFL_PF,
    /* 0x0a = 00001010b */ X86_EFL_PF,
    /* 0x0b = 00001011b */ 0,
    /* 0x0c = 00001100b */ X86_EFL_PF,
    /* 0x0d = 00001101b */ 0,
    /* 0x0e = 00001110b */ 0,
    /* 0x0f = 00001111b */ X86_EFL_PF,
    /* 0x10 = 00010000b */ 0,
    /* 0x11 = 00010001b */ X86_EFL_PF,
    /* 0x12 = 00010010b */ X86_EFL_PF,
    /* 0x13 = 00010011b */ 0,
    /* 0x14 = 00010100b */ X86_EFL_PF,
    /* 0x15 = 00010101b */ 0,
    /* 0x16 = 00010110b */ 0,
    /* 0x17 = 00010111b */ X86_EFL_PF,
    /* 0x18 = 00011000b */ X86_EFL_PF,
    /* 0x19 = 00011001b */ 0,
    /* 0x1a = 00011010b */ 0,
    /* 0x1b = 00011011b */ X86_EFL_PF,
    /* 0x1c = 00011100b */ 0,
    /* 0x1d = 00011101b */ X86_EFL_PF,
    /* 0x1e = 00011110b */ X86_EFL_PF,
    /* 0x1f = 00011111b */ 0,
    /* 0x20 = 00100000b */ 0,
    /* 0x21 = 00100001b */ X86_EFL_PF,
    /* 0x22 = 00100010b */ X86_EFL_PF,
    /* 0x23 = 00100011b */ 0,
    /* 0x24 = 00100100b */ X86_EFL_PF,
    /* 0x25 = 00100101b */ 0,
    /* 0x26 = 00100110b */ 0,
    /* 0x27 = 00100111b */ X86_EFL_PF,
    /* 0x28 = 00101000b */ X86_EFL_PF,
    /* 0x29 = 00101001b */ 0,
    /* 0x2a = 00101010b */ 0,
    /* 0x2b = 00101011b */ X86_EFL_PF,
    /* 0x2c = 00101100b */ 0,
    /* 0x2d = 00101101b */ X86_EFL_PF,
    /* 0x2e = 00101110b */ X86_EFL_PF,
    /* 0x2f = 00101111b */ 0,
    /* 0x30 = 00110000b */ X86_EFL_PF,
    /* 0x31 = 00110001b */ 0,
    /* 0x32 = 00110010b */ 0,
    /* 0x33 = 00110011b */ X86_EFL_PF,
    /* 0x34 = 00110100b */ 0,
    /* 0x35 = 00110101b */ X86_EFL_PF,
    /* 0x36 = 00110110b */ X86_EFL_PF,
    /* 0x37 = 00110111b */ 0,
    /* 0x38 = 00111000b */ 0,
    /* 0x39 = 00111001b */ X86_EFL_PF,
    /* 0x3a = 00111010b */ X86_EFL_PF,
    /* 0x3b = 00111011b */ 0,
    /* 0x3c = 00111100b */ X86_EFL_PF,
    /* 0x3d = 00111101b */ 0,
    /* 0x3e = 00111110b */ 0,
    /* 0x3f = 00111111b */ X86_EFL_PF,
    /* 0x40 = 01000000b */ 0,
    /* 0x41 = 01000001b */ X86_EFL_PF,
    /* 0x42 = 01000010b */ X86_EFL_PF,
    /* 0x43 = 01000011b */ 0,
    /* 0x44 = 01000100b */ X86_EFL_PF,
    /* 0x45 = 01000101b */ 0,
    /* 0x46 = 01000110b */ 0,
    /* 0x47 = 01000111b */ X86_EFL_PF,
    /* 0x48 = 01001000b */ X86_EFL_PF,
    /* 0x49 = 01001001b */ 0,
    /* 0x4a = 01001010b */ 0,
    /* 0x4b = 01001011b */ X86_EFL_PF,
    /* 0x4c = 01001100b */ 0,
    /* 0x4d = 01001101b */ X86_EFL_PF,
    /* 0x4e = 01001110b */ X86_EFL_PF,
    /* 0x4f = 01001111b */ 0,
    /* 0x50 = 01010000b */ X86_EFL_PF,
    /* 0x51 = 01010001b */ 0,
    /* 0x52 = 01010010b */ 0,
    /* 0x53 = 01010011b */ X86_EFL_PF,
    /* 0x54 = 01010100b */ 0,
    /* 0x55 = 01010101b */ X86_EFL_PF,
    /* 0x56 = 01010110b */ X86_EFL_PF,
    /* 0x57 = 01010111b */ 0,
    /* 0x58 = 01011000b */ 0,
    /* 0x59 = 01011001b */ X86_EFL_PF,
    /* 0x5a = 01011010b */ X86_EFL_PF,
    /* 0x5b = 01011011b */ 0,
    /* 0x5c = 01011100b */ X86_EFL_PF,
    /* 0x5d = 01011101b */ 0,
    /* 0x5e = 01011110b */ 0,
    /* 0x5f = 01011111b */ X86_EFL_PF,
    /* 0x60 = 01100000b */ X86_EFL_PF,
    /* 0x61 = 01100001b */ 0,
    /* 0x62 = 01100010b */ 0,
    /* 0x63 = 01100011b */ X86_EFL_PF,
    /* 0x64 = 01100100b */ 0,
    /* 0x65 = 01100101b */ X86_EFL_PF,
    /* 0x66 = 01100110b */ X86_EFL_PF,
    /* 0x67 = 01100111b */ 0,
    /* 0x68 = 01101000b */ 0,
    /* 0x69 = 01101001b */ X86_EFL_PF,
    /* 0x6a = 01101010b */ X86_EFL_PF,
    /* 0x6b = 01101011b */ 0,
    /* 0x6c = 01101100b */ X86_EFL_PF,
    /* 0x6d = 01101101b */ 0,
    /* 0x6e = 01101110b */ 0,
    /* 0x6f = 01101111b */ X86_EFL_PF,
    /* 0x70 = 01110000b */ 0,
    /* 0x71 = 01110001b */ X86_EFL_PF,
    /* 0x72 = 01110010b */ X86_EFL_PF,
    /* 0x73 = 01110011b */ 0,
    /* 0x74 = 01110100b */ X86_EFL_PF,
    /* 0x75 = 01110101b */ 0,
    /* 0x76 = 01110110b */ 0,
    /* 0x77 = 01110111b */ X86_EFL_PF,
    /* 0x78 = 01111000b */ X86_EFL_PF,
    /* 0x79 = 01111001b */ 0,
    /* 0x7a = 01111010b */ 0,
    /* 0x7b = 01111011b */ X86_EFL_PF,
    /* 0x7c = 01111100b */ 0,
    /* 0x7d = 01111101b */ X86_EFL_PF,
    /* 0x7e = 01111110b */ X86_EFL_PF,
    /* 0x7f = 01111111b */ 0,
    /* 0x80 = 10000000b */ 0,
    /* 0x81 = 10000001b */ X86_EFL_PF,
    /* 0x82 = 10000010b */ X86_EFL_PF,
    /* 0x83 = 10000011b */ 0,
    /* 0x84 = 10000100b */ X86_EFL_PF,
    /* 0x85 = 10000101b */ 0,
    /* 0x86 = 10000110b */ 0,
    /* 0x87 = 10000111b */ X86_EFL_PF,
    /* 0x88 = 10001000b */ X86_EFL_PF,
    /* 0x89 = 10001001b */ 0,
    /* 0x8a = 10001010b */ 0,
    /* 0x8b = 10001011b */ X86_EFL_PF,
    /* 0x8c = 10001100b */ 0,
    /* 0x8d = 10001101b */ X86_EFL_PF,
    /* 0x8e = 10001110b */ X86_EFL_PF,
    /* 0x8f = 10001111b */ 0,
    /* 0x90 = 10010000b */ X86_EFL_PF,
    /* 0x91 = 10010001b */ 0,
    /* 0x92 = 10010010b */ 0,
    /* 0x93 = 10010011b */ X86_EFL_PF,
    /* 0x94 = 10010100b */ 0,
    /* 0x95 = 10010101b */ X86_EFL_PF,
    /* 0x96 = 10010110b */ X86_EFL_PF,
    /* 0x97 = 10010111b */ 0,
    /* 0x98 = 10011000b */ 0,
    /* 0x99 = 10011001b */ X86_EFL_PF,
    /* 0x9a = 10011010b */ X86_EFL_PF,
    /* 0x9b = 10011011b */ 0,
    /* 0x9c = 10011100b */ X86_EFL_PF,
    /* 0x9d = 10011101b */ 0,
    /* 0x9e = 10011110b */ 0,
    /* 0x9f = 10011111b */ X86_EFL_PF,
    /* 0xa0 = 10100000b */ X86_EFL_PF,
    /* 0xa1 = 10100001b */ 0,
    /* 0xa2 = 10100010b */ 0,
    /* 0xa3 = 10100011b */ X86_EFL_PF,
    /* 0xa4 = 10100100b */ 0,
    /* 0xa5 = 10100101b */ X86_EFL_PF,
    /* 0xa6 = 10100110b */ X86_EFL_PF,
    /* 0xa7 = 10100111b */ 0,
    /* 0xa8 = 10101000b */ 0,
    /* 0xa9 = 10101001b */ X86_EFL_PF,
    /* 0xaa = 10101010b */ X86_EFL_PF,
    /* 0xab = 10101011b */ 0,
    /* 0xac = 10101100b */ X86_EFL_PF,
    /* 0xad = 10101101b */ 0,
    /* 0xae = 10101110b */ 0,
    /* 0xaf = 10101111b */ X86_EFL_PF,
    /* 0xb0 = 10110000b */ 0,
    /* 0xb1 = 10110001b */ X86_EFL_PF,
    /* 0xb2 = 10110010b */ X86_EFL_PF,
    /* 0xb3 = 10110011b */ 0,
    /* 0xb4 = 10110100b */ X86_EFL_PF,
    /* 0xb5 = 10110101b */ 0,
    /* 0xb6 = 10110110b */ 0,
    /* 0xb7 = 10110111b */ X86_EFL_PF,
    /* 0xb8 = 10111000b */ X86_EFL_PF,
    /* 0xb9 = 10111001b */ 0,
    /* 0xba = 10111010b */ 0,
    /* 0xbb = 10111011b */ X86_EFL_PF,
    /* 0xbc = 10111100b */ 0,
    /* 0xbd = 10111101b */ X86_EFL_PF,
    /* 0xbe = 10111110b */ X86_EFL_PF,
    /* 0xbf = 10111111b */ 0,
    /* 0xc0 = 11000000b */ X86_EFL_PF,
    /* 0xc1 = 11000001b */ 0,
    /* 0xc2 = 11000010b */ 0,
    /* 0xc3 = 11000011b */ X86_EFL_PF,
    /* 0xc4 = 11000100b */ 0,
    /* 0xc5 = 11000101b */ X86_EFL_PF,
    /* 0xc6 = 11000110b */ X86_EFL_PF,
    /* 0xc7 = 11000111b */ 0,
    /* 0xc8 = 11001000b */ 0,
    /* 0xc9 = 11001001b */ X86_EFL_PF,
    /* 0xca = 11001010b */ X86_EFL_PF,
    /* 0xcb = 11001011b */ 0,
    /* 0xcc = 11001100b */ X86_EFL_PF,
    /* 0xcd = 11001101b */ 0,
    /* 0xce = 11001110b */ 0,
    /* 0xcf = 11001111b */ X86_EFL_PF,
    /* 0xd0 = 11010000b */ 0,
    /* 0xd1 = 11010001b */ X86_EFL_PF,
    /* 0xd2 = 11010010b */ X86_EFL_PF,
    /* 0xd3 = 11010011b */ 0,
    /* 0xd4 = 11010100b */ X86_EFL_PF,
    /* 0xd5 = 11010101b */ 0,
    /* 0xd6 = 11010110b */ 0,
    /* 0xd7 = 11010111b */ X86_EFL_PF,
    /* 0xd8 = 11011000b */ X86_EFL_PF,
    /* 0xd9 = 11011001b */ 0,
    /* 0xda = 11011010b */ 0,
    /* 0xdb = 11011011b */ X86_EFL_PF,
    /* 0xdc = 11011100b */ 0,
    /* 0xdd = 11011101b */ X86_EFL_PF,
    /* 0xde = 11011110b */ X86_EFL_PF,
    /* 0xdf = 11011111b */ 0,
    /* 0xe0 = 11100000b */ 0,
    /* 0xe1 = 11100001b */ X86_EFL_PF,
    /* 0xe2 = 11100010b */ X86_EFL_PF,
    /* 0xe3 = 11100011b */ 0,
    /* 0xe4 = 11100100b */ X86_EFL_PF,
    /* 0xe5 = 11100101b */ 0,
    /* 0xe6 = 11100110b */ 0,
    /* 0xe7 = 11100111b */ X86_EFL_PF,
    /* 0xe8 = 11101000b */ X86_EFL_PF,
    /* 0xe9 = 11101001b */ 0,
    /* 0xea = 11101010b */ 0,
    /* 0xeb = 11101011b */ X86_EFL_PF,
    /* 0xec = 11101100b */ 0,
    /* 0xed = 11101101b */ X86_EFL_PF,
    /* 0xee = 11101110b */ X86_EFL_PF,
    /* 0xef = 11101111b */ 0,
    /* 0xf0 = 11110000b */ X86_EFL_PF,
    /* 0xf1 = 11110001b */ 0,
    /* 0xf2 = 11110010b */ 0,
    /* 0xf3 = 11110011b */ X86_EFL_PF,
    /* 0xf4 = 11110100b */ 0,
    /* 0xf5 = 11110101b */ X86_EFL_PF,
    /* 0xf6 = 11110110b */ X86_EFL_PF,
    /* 0xf7 = 11110111b */ 0,
    /* 0xf8 = 11111000b */ 0,
    /* 0xf9 = 11111001b */ X86_EFL_PF,
    /* 0xfa = 11111010b */ X86_EFL_PF,
    /* 0xfb = 11111011b */ 0,
    /* 0xfc = 11111100b */ X86_EFL_PF,
    /* 0xfd = 11111101b */ 0,
    /* 0xfe = 11111110b */ 0,
    /* 0xff = 11111111b */ X86_EFL_PF,
};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT80U g_ar80One[];
464extern const RTFLOAT80U g_r80Indefinite;
465extern const RTFLOAT32U g_ar32Infinity[];
466extern const RTFLOAT64U g_ar64Infinity[];
467extern const RTFLOAT80U g_ar80Infinity[];
468extern const RTFLOAT128U g_r128Ln2;
469extern const RTUINT128U g_u128Ln2Mantissa;
470extern const RTUINT128U g_u128Ln2MantissaIntel;
471extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
472extern const RTFLOAT32U g_ar32QNaN[];
473extern const RTFLOAT64U g_ar64QNaN[];
474
475/** Zero values (indexed by fSign). */
476RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
477RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
478RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
479
480/** One values (indexed by fSign). */
481RTFLOAT80U const g_ar80One[] =
482{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
483
484/** Indefinite (negative). */
485RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
486
487/** Infinities (indexed by fSign). */
488RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
489RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
490RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
491
492/** Default QNaNs (indexed by fSign). */
493RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
494RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
495
496
497#if 0
498/** 128-bit floating point constant: 2.0 */
499const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
500#endif
501
502
/* The next section is generated by tools/IEMGenFpuConstants: */

/** The ln2 constant as 128-bit floating point value.
 * base-10: 6.93147180559945309417232121458176575e-1
 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1
 *
 * @note The digits above correspond to the full mantissa in the disabled
 *       initializer below.  The initializer actually in use keeps only the top
 *       20 bits of the low 64-bit mantissa word (0xf3579...) and has the rest
 *       zeroed.  NOTE(review): the reason for the truncation is not visible
 *       here - confirm against the users of g_r128Ln2. */
//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
/** High precision ln2 value (128-bit mantissa, leading one bit included).
 * base-10: 6.931471805599453094172321214581765680747e-1
 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
 * Same upper 64 bits as g_u128Ln2Mantissa; only the top two bits of the lower
 * 64 bits are kept.
 * base-10: 6.931471805599453094151379470289064954613e-1
 * base-16: b.17217f7d1cf79abc0000000000000000@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
521
/** Horner constants for f2xm1.
 *
 * 22 coefficients, a0 thru a21, stored as 128-bit floating point values.  Going
 * by the base-10 values in the per-entry comments, entry a<n> is 1/(n+1)!
 * (1, 1/2, 1/6, 1/24, ...).  NOTE(review): the code evaluating the polynomial
 * is not visible here - confirm the evaluation order against the f2xm1
 * implementation. */
const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
{
    /* a0
     * base-10: 1.00000000000000000000000000000000000e0
     * base-16: 1.0000000000000000000000000000@0
     * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
    RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
    /* a1
     * base-10: 5.00000000000000000000000000000000000e-1
     * base-16: 8.0000000000000000000000000000@-1
     * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
    RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
    /* a2
     * base-10: 1.66666666666666666666666666666666658e-1
     * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
     * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
    RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
    /* a3
     * base-10: 4.16666666666666666666666666666666646e-2
     * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
     * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
    RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
    /* a4
     * base-10: 8.33333333333333333333333333333333323e-3
     * base-16: 2.2222222222222222222222222222@-2
     * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
    RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
    /* a5
     * base-10: 1.38888888888888888888888888888888874e-3
     * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
     * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
    RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
    /* a6
     * base-10: 1.98412698412698412698412698412698412e-4
     * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
     * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
    RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
    /* a7
     * base-10: 2.48015873015873015873015873015873015e-5
     * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
     * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
    RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
    /* a8
     * base-10: 2.75573192239858906525573192239858902e-6
     * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
     * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
    RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
    /* a9
     * base-10: 2.75573192239858906525573192239858865e-7
     * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
     * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
    RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
    /* a10
     * base-10: 2.50521083854417187750521083854417184e-8
     * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
     * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
    RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
    /* a11
     * base-10: 2.08767569878680989792100903212014296e-9
     * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
     * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
    RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
    /* a12
     * base-10: 1.60590438368216145993923771701549472e-10
     * base-16: b.092309d43684be51c198e91d7b40@-9
     * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
    RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
    /* a13
     * base-10: 1.14707455977297247138516979786821043e-11
     * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
     * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
    RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
    /* a14
     * base-10: 7.64716373181981647590113198578806964e-13
     * base-16: d.73f9f399dc0f88ec32b587746578@-11
     * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
    RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
    /* a15
     * base-10: 4.77947733238738529743820749111754352e-14
     * base-16: d.73f9f399dc0f88ec32b587746578@-12
     * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
    RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
    /* a16
     * base-10: 2.81145725434552076319894558301031970e-15
     * base-16: c.a963b81856a53593028cbbb8d7f8@-13
     * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
    RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
    /* a17
     * base-10: 1.56192069685862264622163643500573321e-16
     * base-16: b.413c31dcbecbbdd8024435161550@-14
     * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
    RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
    /* a18
     * base-10: 8.22063524662432971695598123687227980e-18
     * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
     * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
    RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
    /* a19
     * base-10: 4.11031762331216485847799061843614006e-19
     * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
     * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
    RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
    /* a20
     * base-10: 1.95729410633912612308475743735054143e-20
     * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
     * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
    RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
    /* a21
     * base-10: 8.89679139245057328674889744250246106e-22
     * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
     * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
    RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
};
636
637
638/*
639 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
640 * it all in C is probably safer atm., optimize what's necessary later, maybe.
641 */
642#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
643
644
645/*********************************************************************************************************************************
646* Binary Operations *
647*********************************************************************************************************************************/
648
649/*
650 * ADD
651 */
652
653IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
654{
655 uint64_t uDst = *puDst;
656 uint64_t uResult = uDst + uSrc;
657 *puDst = uResult;
658 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
659}
660
661# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
662
663IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
664{
665 uint32_t uDst = *puDst;
666 uint32_t uResult = uDst + uSrc;
667 *puDst = uResult;
668 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
669}
670
671
672IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
673{
674 uint16_t uDst = *puDst;
675 uint16_t uResult = uDst + uSrc;
676 *puDst = uResult;
677 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
678}
679
680
681IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
682{
683 uint8_t uDst = *puDst;
684 uint8_t uResult = uDst + uSrc;
685 *puDst = uResult;
686 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
687}
688
689# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
690
691/*
692 * ADC
693 */
694
695IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
696{
697 if (!(*pfEFlags & X86_EFL_CF))
698 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
699 else
700 {
701 uint64_t uDst = *puDst;
702 uint64_t uResult = uDst + uSrc + 1;
703 *puDst = uResult;
704 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
705 }
706}
707
708# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
709
710IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
711{
712 if (!(*pfEFlags & X86_EFL_CF))
713 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
714 else
715 {
716 uint32_t uDst = *puDst;
717 uint32_t uResult = uDst + uSrc + 1;
718 *puDst = uResult;
719 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
720 }
721}
722
723
724IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
725{
726 if (!(*pfEFlags & X86_EFL_CF))
727 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
728 else
729 {
730 uint16_t uDst = *puDst;
731 uint16_t uResult = uDst + uSrc + 1;
732 *puDst = uResult;
733 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
734 }
735}
736
737
738IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
739{
740 if (!(*pfEFlags & X86_EFL_CF))
741 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
742 else
743 {
744 uint8_t uDst = *puDst;
745 uint8_t uResult = uDst + uSrc + 1;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
748 }
749}
750
751# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
752
753/*
754 * SUB
755 */
756
757IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
758{
759 uint64_t uDst = *puDst;
760 uint64_t uResult = uDst - uSrc;
761 *puDst = uResult;
762 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
763}
764
765# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
766
767IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
768{
769 uint32_t uDst = *puDst;
770 uint32_t uResult = uDst - uSrc;
771 *puDst = uResult;
772 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
773}
774
775
776IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
777{
778 uint16_t uDst = *puDst;
779 uint16_t uResult = uDst - uSrc;
780 *puDst = uResult;
781 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
782}
783
784
785IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
786{
787 uint8_t uDst = *puDst;
788 uint8_t uResult = uDst - uSrc;
789 *puDst = uResult;
790 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
791}
792
793# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
794
795/*
796 * SBB
797 */
798
799IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
800{
801 if (!(*pfEFlags & X86_EFL_CF))
802 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
803 else
804 {
805 uint64_t uDst = *puDst;
806 uint64_t uResult = uDst - uSrc - 1;
807 *puDst = uResult;
808 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
809 }
810}
811
812# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
813
814IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
815{
816 if (!(*pfEFlags & X86_EFL_CF))
817 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
818 else
819 {
820 uint32_t uDst = *puDst;
821 uint32_t uResult = uDst - uSrc - 1;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
824 }
825}
826
827
828IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
829{
830 if (!(*pfEFlags & X86_EFL_CF))
831 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
832 else
833 {
834 uint16_t uDst = *puDst;
835 uint16_t uResult = uDst - uSrc - 1;
836 *puDst = uResult;
837 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
838 }
839}
840
841
842IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
843{
844 if (!(*pfEFlags & X86_EFL_CF))
845 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
846 else
847 {
848 uint8_t uDst = *puDst;
849 uint8_t uResult = uDst - uSrc - 1;
850 *puDst = uResult;
851 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
852 }
853}
854
855# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
856
857
858/*
859 * OR
860 */
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
863{
864 uint64_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
867}
868
869# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
870
871IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
872{
873 uint32_t uResult = *puDst | uSrc;
874 *puDst = uResult;
875 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
876}
877
878
879IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
880{
881 uint16_t uResult = *puDst | uSrc;
882 *puDst = uResult;
883 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
884}
885
886
887IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
888{
889 uint8_t uResult = *puDst | uSrc;
890 *puDst = uResult;
891 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896/*
897 * XOR
898 */
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
901{
902 uint64_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
905}
906
907# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
908
909IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
910{
911 uint32_t uResult = *puDst ^ uSrc;
912 *puDst = uResult;
913 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
914}
915
916
917IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
918{
919 uint16_t uResult = *puDst ^ uSrc;
920 *puDst = uResult;
921 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
922}
923
924
925IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
926{
927 uint8_t uResult = *puDst ^ uSrc;
928 *puDst = uResult;
929 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
930}
931
932# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
933
934/*
935 * AND
936 */
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
939{
940 uint64_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
943}
944
945# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
946
947IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
948{
949 uint32_t const uResult = *puDst & uSrc;
950 *puDst = uResult;
951 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
952}
953
954
955IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
956{
957 uint16_t const uResult = *puDst & uSrc;
958 *puDst = uResult;
959 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
960}
961
962
963IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
964{
965 uint8_t const uResult = *puDst & uSrc;
966 *puDst = uResult;
967 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
968}
969
970# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
971#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
972
973/*
974 * ANDN (BMI1 instruction)
975 */
976
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
978{
979 uint64_t const uResult = ~uSrc1 & uSrc2;
980 *puDst = uResult;
981 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
982}
983
984
985IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
986{
987 uint32_t const uResult = ~uSrc1 & uSrc2;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
990}
991
992
993#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/** ANDN, 64-bit: plain-named entry point that forwards to the C fallback. */
IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
{
    iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
}
998#endif
999
1000
1001#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
/** ANDN, 32-bit: plain-named entry point that forwards to the C fallback. */
IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
{
    iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
}
1006#endif
1007
1008#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1009
1010/*
1011 * CMP
1012 */
1013
1014IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1015{
1016 uint64_t uDstTmp = *puDst;
1017 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1018}
1019
1020# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1021
1022IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1023{
1024 uint32_t uDstTmp = *puDst;
1025 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1026}
1027
1028
1029IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1030{
1031 uint16_t uDstTmp = *puDst;
1032 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1033}
1034
1035
1036IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1037{
1038 uint8_t uDstTmp = *puDst;
1039 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1040}
1041
1042# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1043
1044/*
1045 * TEST
1046 */
1047
1048IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1049{
1050 uint64_t uResult = *puDst & uSrc;
1051 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1052}
1053
1054# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1055
1056IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1057{
1058 uint32_t uResult = *puDst & uSrc;
1059 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1060}
1061
1062
1063IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1064{
1065 uint16_t uResult = *puDst & uSrc;
1066 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1067}
1068
1069
1070IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1071{
1072 uint8_t uResult = *puDst & uSrc;
1073 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1074}
1075
1076# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1077
1078
1079/*
1080 * LOCK prefixed variants of the above
1081 */
1082
/**
 * Locked binary operand operation on an operand of a_cBitsWidth bits.
 *
 * Applies the non-locked iemAImpl_<a_Mnemonic>_u<a_cBitsWidth> worker to a
 * private copy of the destination and of the flags, then tries to publish the
 * result with a compare-exchange against the value the computation started
 * from.  The loop repeats while ASMAtomicCmpXchgExU<N> reports failure
 * (presumably refreshing uOld with the current memory value - confirm against
 * iprt/asm.h).  *pfEFlags is only written once the exchange has succeeded.
 *
 * Expects puDst, uSrc and pfEFlags to be in scope at the expansion site.
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uTmp; \
        uint32_t fEflTmp; \
        do \
        { \
            uTmp = uOld; \
            fEflTmp = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, uSrc, &fEflTmp); \
        } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
        *pfEFlags = fEflTmp; \
    } while (0)
1097
1098
/**
 * Emits the function iemAImpl_<a_Mnemonic>_u<a_cBitsWidth>_locked, which takes
 * the same (puDst, uSrc, pfEFlags) arguments as the non-locked worker and whose
 * body is a single DO_LOCKED_BIN_OP expansion.
 */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
                                                                                     uint ## a_cBitsWidth ## _t uSrc, \
                                                                                     uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1106
/* Locked variants of add, adc, sub, sbb, or, xor and and: the 64-bit ones are
   emitted unconditionally within this section, the 32/16/8-bit ones only when
   the condition below holds. */
EMIT_LOCKED_BIN_OP(add, 64)
EMIT_LOCKED_BIN_OP(adc, 64)
EMIT_LOCKED_BIN_OP(sub, 64)
EMIT_LOCKED_BIN_OP(sbb, 64)
EMIT_LOCKED_BIN_OP(or,  64)
EMIT_LOCKED_BIN_OP(xor, 64)
EMIT_LOCKED_BIN_OP(and, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_LOCKED_BIN_OP(add, 32)
EMIT_LOCKED_BIN_OP(adc, 32)
EMIT_LOCKED_BIN_OP(sub, 32)
EMIT_LOCKED_BIN_OP(sbb, 32)
EMIT_LOCKED_BIN_OP(or,  32)
EMIT_LOCKED_BIN_OP(xor, 32)
EMIT_LOCKED_BIN_OP(and, 32)

EMIT_LOCKED_BIN_OP(add, 16)
EMIT_LOCKED_BIN_OP(adc, 16)
EMIT_LOCKED_BIN_OP(sub, 16)
EMIT_LOCKED_BIN_OP(sbb, 16)
EMIT_LOCKED_BIN_OP(or,  16)
EMIT_LOCKED_BIN_OP(xor, 16)
EMIT_LOCKED_BIN_OP(and, 16)

EMIT_LOCKED_BIN_OP(add, 8)
EMIT_LOCKED_BIN_OP(adc, 8)
EMIT_LOCKED_BIN_OP(sub, 8)
EMIT_LOCKED_BIN_OP(sbb, 8)
EMIT_LOCKED_BIN_OP(or,  8)
EMIT_LOCKED_BIN_OP(xor, 8)
EMIT_LOCKED_BIN_OP(and, 8)
# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1139
1140
1141/*
1142 * Bit operations (same signature as above).
1143 */
1144
1145/*
1146 * BT
1147 */
1148
1149IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1150{
1151 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1152 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1153 Assert(uSrc < 64);
1154 uint64_t uDst = *puDst;
1155 if (uDst & RT_BIT_64(uSrc))
1156 *pfEFlags |= X86_EFL_CF;
1157 else
1158 *pfEFlags &= ~X86_EFL_CF;
1159}
1160
1161# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1162
1163IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1164{
1165 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1166 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1167 Assert(uSrc < 32);
1168 uint32_t uDst = *puDst;
1169 if (uDst & RT_BIT_32(uSrc))
1170 *pfEFlags |= X86_EFL_CF;
1171 else
1172 *pfEFlags &= ~X86_EFL_CF;
1173}
1174
1175IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1176{
1177 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1178 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1179 Assert(uSrc < 16);
1180 uint16_t uDst = *puDst;
1181 if (uDst & RT_BIT_32(uSrc))
1182 *pfEFlags |= X86_EFL_CF;
1183 else
1184 *pfEFlags &= ~X86_EFL_CF;
1185}
1186
1187# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1188
1189/*
1190 * BTC
1191 */
1192
1193IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1194{
1195 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1196 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1197 Assert(uSrc < 64);
1198 uint64_t fMask = RT_BIT_64(uSrc);
1199 uint64_t uDst = *puDst;
1200 if (uDst & fMask)
1201 {
1202 uDst &= ~fMask;
1203 *puDst = uDst;
1204 *pfEFlags |= X86_EFL_CF;
1205 }
1206 else
1207 {
1208 uDst |= fMask;
1209 *puDst = uDst;
1210 *pfEFlags &= ~X86_EFL_CF;
1211 }
1212}
1213
1214# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1215
1216IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1217{
1218 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1219 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1220 Assert(uSrc < 32);
1221 uint32_t fMask = RT_BIT_32(uSrc);
1222 uint32_t uDst = *puDst;
1223 if (uDst & fMask)
1224 {
1225 uDst &= ~fMask;
1226 *puDst = uDst;
1227 *pfEFlags |= X86_EFL_CF;
1228 }
1229 else
1230 {
1231 uDst |= fMask;
1232 *puDst = uDst;
1233 *pfEFlags &= ~X86_EFL_CF;
1234 }
1235}
1236
1237
1238IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1239{
1240 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1241 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1242 Assert(uSrc < 16);
1243 uint16_t fMask = RT_BIT_32(uSrc);
1244 uint16_t uDst = *puDst;
1245 if (uDst & fMask)
1246 {
1247 uDst &= ~fMask;
1248 *puDst = uDst;
1249 *pfEFlags |= X86_EFL_CF;
1250 }
1251 else
1252 {
1253 uDst |= fMask;
1254 *puDst = uDst;
1255 *pfEFlags &= ~X86_EFL_CF;
1256 }
1257}
1258
1259# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1260
1261/*
1262 * BTR
1263 */
1264
1265IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1266{
1267 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1268 logical operation (AND/OR/whatever). */
1269 Assert(uSrc < 64);
1270 uint64_t fMask = RT_BIT_64(uSrc);
1271 uint64_t uDst = *puDst;
1272 if (uDst & fMask)
1273 {
1274 uDst &= ~fMask;
1275 *puDst = uDst;
1276 *pfEFlags |= X86_EFL_CF;
1277 }
1278 else
1279 *pfEFlags &= ~X86_EFL_CF;
1280}
1281
1282# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1283
1284IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1285{
1286 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1287 logical operation (AND/OR/whatever). */
1288 Assert(uSrc < 32);
1289 uint32_t fMask = RT_BIT_32(uSrc);
1290 uint32_t uDst = *puDst;
1291 if (uDst & fMask)
1292 {
1293 uDst &= ~fMask;
1294 *puDst = uDst;
1295 *pfEFlags |= X86_EFL_CF;
1296 }
1297 else
1298 *pfEFlags &= ~X86_EFL_CF;
1299}
1300
1301
1302IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1303{
1304 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1305 logical operation (AND/OR/whatever). */
1306 Assert(uSrc < 16);
1307 uint16_t fMask = RT_BIT_32(uSrc);
1308 uint16_t uDst = *puDst;
1309 if (uDst & fMask)
1310 {
1311 uDst &= ~fMask;
1312 *puDst = uDst;
1313 *pfEFlags |= X86_EFL_CF;
1314 }
1315 else
1316 *pfEFlags &= ~X86_EFL_CF;
1317}
1318
1319# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1320
1321/*
1322 * BTS
1323 */
1324
1325IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1326{
1327 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1328 logical operation (AND/OR/whatever). */
1329 Assert(uSrc < 64);
1330 uint64_t fMask = RT_BIT_64(uSrc);
1331 uint64_t uDst = *puDst;
1332 if (uDst & fMask)
1333 *pfEFlags |= X86_EFL_CF;
1334 else
1335 {
1336 uDst |= fMask;
1337 *puDst = uDst;
1338 *pfEFlags &= ~X86_EFL_CF;
1339 }
1340}
1341
1342# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1343
1344IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1345{
1346 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1347 logical operation (AND/OR/whatever). */
1348 Assert(uSrc < 32);
1349 uint32_t fMask = RT_BIT_32(uSrc);
1350 uint32_t uDst = *puDst;
1351 if (uDst & fMask)
1352 *pfEFlags |= X86_EFL_CF;
1353 else
1354 {
1355 uDst |= fMask;
1356 *puDst = uDst;
1357 *pfEFlags &= ~X86_EFL_CF;
1358 }
1359}
1360
1361
1362IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1363{
1364 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1365 logical operation (AND/OR/whatever). */
1366 Assert(uSrc < 16);
1367 uint16_t fMask = RT_BIT_32(uSrc);
1368 uint32_t uDst = *puDst;
1369 if (uDst & fMask)
1370 *pfEFlags |= X86_EFL_CF;
1371 else
1372 {
1373 uDst |= fMask;
1374 *puDst = uDst;
1375 *pfEFlags &= ~X86_EFL_CF;
1376 }
1377}
1378
1379# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1380
1381
/* Locked variants of btc, btr and bts (no locked bt is emitted here); the
   32/16-bit ones only when the condition below holds. */
EMIT_LOCKED_BIN_OP(btc, 64)
EMIT_LOCKED_BIN_OP(btr, 64)
EMIT_LOCKED_BIN_OP(bts, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_LOCKED_BIN_OP(btc, 32)
EMIT_LOCKED_BIN_OP(btr, 32)
EMIT_LOCKED_BIN_OP(bts, 32)

EMIT_LOCKED_BIN_OP(btc, 16)
EMIT_LOCKED_BIN_OP(btr, 16)
EMIT_LOCKED_BIN_OP(bts, 16)
# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1394
1395
/*
 * Helpers for BSR and BSF.
 *
 * Note! "undefined" flags: OF, SF, AF, PF, CF.
 *       Intel behavior modelled on 10980xe, AMD on 3990X.  Other
 *       microarchitectures may produce different results (see
 *       https://www.sandpile.org/x86/flags.htm), but we restrict ourselves to
 *       emulating these recent microarchitectures.
 */
/**
 * BSF/BSR result handling, Intel flavour.
 *
 * @param a_puDst     Where to store the zero-based bit index (untouched when
 *                    no bit was found).
 * @param a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param a_iBit      One-based index of the bit found, 0 if none.
 *
 * Clears OF, SF, ZF, AF, PF and CF, then either ORs in the parity table entry
 * for the stored index (bit found) or sets ZF and PF (no bit found).
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * BSF/BSR result handling, AMD flavour.
 *
 * @param a_puDst     Where to store the zero-based bit index (untouched when
 *                    no bit was found).
 * @param a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param a_iBit      One-based index of the bit found, 0 if none.
 *
 * Only ZF is modified: cleared when a bit was found, set otherwise.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1426
1427
1428/*
1429 * BSF - first (least significant) bit set
1430 */
1431IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1432{
1433 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1434}
1435
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1451}
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1467}
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1480
1481
1482/*
1483 * BSR - last (most significant) bit set
1484 */
1485IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1486{
1487 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1488}
1489
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1505}
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1521}
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1534
1535
1536/*
1537 * Helpers for LZCNT and TZCNT.
1538 */
/**
 * LZCNT/TZCNT result handling, Intel flavour.
 *
 * @param a_puDst     Where to store the count.
 * @param a_uSrc      The source operand; CF is set when it is zero.
 * @param a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param a_uResult   The zero-bit count to store.
 *
 * Clears OF, SF, ZF, AF, PF and CF, then ORs in the parity table entry for a
 * non-zero count, or ZF and PF for a zero count.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * LZCNT/TZCNT result handling, AMD flavour.
 *
 * @param a_puDst     Where to store the count.
 * @param a_uSrc      The source operand; CF is set when it is zero.
 * @param a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param a_uResult   The zero-bit count to store.
 *
 * Only ZF and CF are modified: ZF is set for a zero count, CF for a zero
 * source.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1561
1562
1563/*
1564 * LZCNT - count leading zero bits.
1565 */
1566IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1567{
1568 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1569}
1570
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1584{
1585 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1586}
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598
1599IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1600{
1601 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1602}
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1615
1616
/*
 * TZCNT - count trailing zero bits.
 */
1620IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1621{
1622 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1623}
1624
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1636
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1638{
1639 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1640}
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1654{
1655 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1656}
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1669#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1670
1671/*
1672 * BEXTR (BMI1 instruction)
1673 */
1674#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1675IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1676 a_Type uSrc2, uint32_t *pfEFlags)) \
1677{ \
1678 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1679 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1680 a_Type uResult; \
1681 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1682 if (iFirstBit < a_cBits) \
1683 { \
1684 uResult = uSrc1 >> iFirstBit; \
1685 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1686 if (cBits < a_cBits) \
1687 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1688 *puDst = uResult; \
1689 if (!uResult) \
1690 fEfl |= X86_EFL_ZF; \
1691 } \
1692 else \
1693 { \
1694 *puDst = uResult = 0; \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 /** @todo complete flag calculations. */ \
1698 *pfEFlags = fEfl; \
1699}
1700
1701EMIT_BEXTR(64, uint64_t, _fallback)
1702EMIT_BEXTR(32, uint32_t, _fallback)
1703#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1704EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1705#endif
1706#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1708#endif
1709
1710/*
1711 * BLSR (BMI1 instruction)
1712 */
1713#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1714IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1715{ \
1716 uint32_t fEfl1 = *pfEFlags; \
1717 uint32_t fEfl2 = fEfl1; \
1718 *puDst = uSrc; \
1719 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1720 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1721 \
1722 /* AMD: The carry flag is from the SUB operation. */ \
1723 /* 10890xe: PF always cleared? */ \
1724 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1725 fEfl2 |= fEfl1 & X86_EFL_CF; \
1726 *pfEFlags = fEfl2; \
1727}
1728
1729EMIT_BLSR(64, uint64_t, _fallback)
1730EMIT_BLSR(32, uint32_t, _fallback)
1731#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1732EMIT_BLSR(64, uint64_t, RT_NOTHING)
1733#endif
1734#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSR(32, uint32_t, RT_NOTHING)
1736#endif
1737
1738/*
1739 * BLSMSK (BMI1 instruction)
1740 */
1741#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1742IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1743{ \
1744 uint32_t fEfl1 = *pfEFlags; \
1745 uint32_t fEfl2 = fEfl1; \
1746 *puDst = uSrc; \
1747 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1748 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1749 \
1750 /* AMD: The carry flag is from the SUB operation. */ \
1751 /* 10890xe: PF always cleared? */ \
1752 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1753 fEfl2 |= fEfl1 & X86_EFL_CF; \
1754 *pfEFlags = fEfl2; \
1755}
1756
1757EMIT_BLSMSK(64, uint64_t, _fallback)
1758EMIT_BLSMSK(32, uint32_t, _fallback)
1759#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1760EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1761#endif
1762#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1764#endif
1765
1766/*
1767 * BLSI (BMI1 instruction)
1768 */
1769#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1770IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1771{ \
1772 uint32_t fEfl1 = *pfEFlags; \
1773 uint32_t fEfl2 = fEfl1; \
1774 *puDst = uSrc; \
1775 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1776 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1777 \
1778 /* AMD: The carry flag is from the SUB operation. */ \
1779 /* 10890xe: PF always cleared? */ \
1780 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1781 fEfl2 |= fEfl1 & X86_EFL_CF; \
1782 *pfEFlags = fEfl2; \
1783}
1784
1785EMIT_BLSI(64, uint64_t, _fallback)
1786EMIT_BLSI(32, uint32_t, _fallback)
1787#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1788EMIT_BLSI(64, uint64_t, RT_NOTHING)
1789#endif
1790#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1791EMIT_BLSI(32, uint32_t, RT_NOTHING)
1792#endif
1793
1794/*
1795 * BZHI (BMI2 instruction)
1796 */
1797#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1798IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1799 a_Type uSrc2, uint32_t *pfEFlags)) \
1800{ \
1801 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1802 a_Type uResult; \
1803 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1804 if (iFirstBit < a_cBits) \
1805 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1806 else \
1807 { \
1808 uResult = uSrc1; \
1809 fEfl |= X86_EFL_CF; \
1810 } \
1811 *puDst = uResult; \
1812 fEfl |= X86_EFL_CALC_ZF(uResult); \
1813 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1814 *pfEFlags = fEfl; \
1815}
1816
1817EMIT_BZHI(64, uint64_t, _fallback)
1818EMIT_BZHI(32, uint32_t, _fallback)
1819#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1820EMIT_BZHI(64, uint64_t, RT_NOTHING)
1821#endif
1822#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1823EMIT_BZHI(32, uint32_t, RT_NOTHING)
1824#endif
1825
1826/*
1827 * POPCNT
1828 */
1829RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1830{
1831 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1832 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1833 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1834 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1835};
1836
1837/** @todo Use native popcount where possible and employ some more efficient
1838 * algorithm here (or in asm.h fallback)! */
1839
1840DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1841{
1842 return g_abBitCounts6[ u16 & 0x3f]
1843 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1844 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1845}
1846
1847DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1848{
1849 return g_abBitCounts6[ u32 & 0x3f]
1850 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1851 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1852 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1853 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1854 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1855}
1856
1857DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1858{
1859 return g_abBitCounts6[ u64 & 0x3f]
1860 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1861 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1862 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1863 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1864 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1865 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1870}
1871
1872#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1873IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1874{ \
1875 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1876 a_Type uResult; \
1877 if (uSrc) \
1878 uResult = iemPopCountU ## a_cBits(uSrc); \
1879 else \
1880 { \
1881 fEfl |= X86_EFL_ZF; \
1882 uResult = 0; \
1883 } \
1884 *puDst = uResult; \
1885 *pfEFlags = fEfl; \
1886}
1887
1888EMIT_POPCNT(64, uint64_t, _fallback)
1889EMIT_POPCNT(32, uint32_t, _fallback)
1890EMIT_POPCNT(16, uint16_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1896EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1897#endif
1898
1899
1900#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1901
1902/*
1903 * XCHG
1904 */
1905
1906IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1907{
1908#if ARCH_BITS >= 64
1909 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1910#else
1911 uint64_t uOldMem = *puMem;
1912 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1913 ASMNopPause();
1914 *puReg = uOldMem;
1915#endif
1916}
1917
1918# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1919
1920IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1921{
1922 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1923}
1924
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1927{
1928 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1929}
1930
1931
1932IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1933{
1934 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1935}
1936
1937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1938
1939
1940/* Unlocked variants for fDisregardLock mode: */
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1943{
1944 uint64_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1950
1951IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1952{
1953 uint32_t const uOld = *puMem;
1954 *puMem = *puReg;
1955 *puReg = uOld;
1956}
1957
1958
1959IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1960{
1961 uint16_t const uOld = *puMem;
1962 *puMem = *puReg;
1963 *puReg = uOld;
1964}
1965
1966
1967IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1968{
1969 uint8_t const uOld = *puMem;
1970 *puMem = *puReg;
1971 *puReg = uOld;
1972}
1973
1974# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1975
1976
1977/*
1978 * XADD and LOCK XADD.
1979 */
1980#define EMIT_XADD(a_cBitsWidth, a_Type) \
1981IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1982{ \
1983 a_Type uDst = *puDst; \
1984 a_Type uResult = uDst; \
1985 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1986 *puDst = uResult; \
1987 *puReg = uDst; \
1988} \
1989\
1990IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1991{ \
1992 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1993 a_Type uResult; \
1994 uint32_t fEflTmp; \
1995 do \
1996 { \
1997 uResult = uOld; \
1998 fEflTmp = *pfEFlags; \
1999 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2000 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2001 *puReg = uOld; \
2002 *pfEFlags = fEflTmp; \
2003}
2004EMIT_XADD(64, uint64_t)
2005# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2006EMIT_XADD(32, uint32_t)
2007EMIT_XADD(16, uint16_t)
2008EMIT_XADD(8, uint8_t)
2009# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2010
2011#endif
2012
2013/*
2014 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2015 *
2016 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2017 * instructions are emulated as locked.
2018 */
2019#if defined(IEM_WITHOUT_ASSEMBLY)
2020
2021IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2022{
2023 uint8_t uOld = *puAl;
2024 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2025 Assert(*puAl == uOld);
2026 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2031{
2032 uint16_t uOld = *puAx;
2033 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2034 Assert(*puAx == uOld);
2035 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2040{
2041 uint32_t uOld = *puEax;
2042 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2043 Assert(*puEax == uOld);
2044 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2045}
2046
2047
2048# if ARCH_BITS == 32
2049IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2050# else
2051IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2052# endif
2053{
2054# if ARCH_BITS == 32
2055 uint64_t const uSrcReg = *puSrcReg;
2056# endif
2057 uint64_t uOld = *puRax;
2058 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2059 Assert(*puRax == uOld);
2060 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2061}
2062
2063
2064IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2065 uint32_t *pEFlags))
2066{
2067 uint64_t const uNew = pu64EbxEcx->u;
2068 uint64_t const uOld = pu64EaxEdx->u;
2069 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2070 {
2071 Assert(pu64EaxEdx->u == uOld);
2072 *pEFlags |= X86_EFL_ZF;
2073 }
2074 else
2075 *pEFlags &= ~X86_EFL_ZF;
2076}
2077
2078
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * Locked CMPXCHG16B, built for AMD64 and ARM64 hosts only.
 *
 * Uses the host's 128-bit compare-and-exchange primitive (the v2 variant
 * taking separate high/low halves on AMD64).  Only ZF is changed: set on a
 * successful exchange, cleared otherwise.
 *
 * @param   pu128Dst    The memory operand.
 * @param   pu128RaxRdx Comparand on input, updated by the exchange.
 * @param   pu128RbxRcx The replacement value.
 * @param   pEFlags     EFLAGS to update.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    RTUINT128U const uExpected = *pu128RaxRdx; /* only needed by the assertion below */
#  endif
#  if !defined(RT_ARCH_AMD64)
    bool const fSwapped = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  else
    bool const fSwapped = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo,
                                                 pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo, &pu128RaxRdx->u);
#  endif
    if (!fSwapped)
        *pEFlags &= ~X86_EFL_ZF;
    else
    {
        Assert(pu128RaxRdx->s.Lo == uExpected.s.Lo && pu128RaxRdx->s.Hi == uExpected.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
}
# endif
2100
2101#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2102
2103# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2104IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2105 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2106{
2107 RTUINT128U u128Tmp = *pu128Dst;
2108 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2109 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2110 {
2111 *pu128Dst = *pu128RbxRcx;
2112 *pEFlags |= X86_EFL_ZF;
2113 }
2114 else
2115 {
2116 *pu128RaxRdx = u128Tmp;
2117 *pEFlags &= ~X86_EFL_ZF;
2118 }
2119}
2120#endif /* !RT_ARCH_ARM64 */
2121
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
2124/* Unlocked versions mapped to the locked ones: */
2125
/**
 * Unlocked 8-bit CMPXCHG; forwards all arguments unchanged to
 * iemAImpl_cmpxchg_u8_locked, so the operation is carried out as locked.
 */
2126IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2127{
2128 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2129}
2130
2131
/**
 * Unlocked 16-bit CMPXCHG; forwards all arguments unchanged to
 * iemAImpl_cmpxchg_u16_locked, so the operation is carried out as locked.
 */
2132IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2133{
2134 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2135}
2136
2137
/**
 * Unlocked 32-bit CMPXCHG; forwards all arguments unchanged to
 * iemAImpl_cmpxchg_u32_locked, so the operation is carried out as locked.
 */
2138IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2139{
2140 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2141}
2142
2143
/**
 * Unlocked 64-bit CMPXCHG; forwards all arguments unchanged to
 * iemAImpl_cmpxchg_u64_locked, so the operation is carried out as locked.
 * When ARCH_BITS is 32 the source value is passed by pointer (puSrcReg),
 * otherwise by value (uSrcReg).
 */
2144# if ARCH_BITS == 32
2145IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2146{
2147 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2148}
2149# else
2150IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2151{
2152 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2153}
2154# endif
2155
2156
/**
 * Unlocked CMPXCHG8B; forwards all arguments unchanged to
 * iemAImpl_cmpxchg8b_locked, so the operation is carried out as locked.
 */
2157IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2158{
2159 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2160}
2161
2162
/**
 * Unlocked CMPXCHG16B; forwards all arguments unchanged to
 * iemAImpl_cmpxchg16b_locked, so the operation is carried out as locked.
 */
2163IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2164 uint32_t *pEFlags))
2165{
2166 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2167}
2168
2169#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2170
2171#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2172 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2173
2174/*
2175 * MUL, IMUL, DIV and IDIV helpers.
2176 *
2177 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2178 * division step so we can select between using C operators and
2179 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2180 *
2181 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
2182 * IDIV/DIV taking all their input from AX too. This means we have to abstract
2183 * some of the input loads and the result storing.
2184 */
2185
2186DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2187{
2188# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2189 pQuotient->s.Lo = 0;
2190 pQuotient->s.Hi = 0;
2191# endif
2192 RTUINT128U Divisor;
2193 Divisor.s.Lo = u64Divisor;
2194 Divisor.s.Hi = 0;
2195 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2196}
2197
/*
 * Operand load/store and arithmetic primitives plugged into the EMIT_MUL,
 * EMIT_IMUL, EMIT_DIV and EMIT_IDIV generators.
 *
 * The load/store macros rely on the generated function's parameter names:
 * puA and puD for the 16/32/64-bit forms, puAX for the 8-bit forms.
 *
 * Every macro expands to exactly one parenthesised expression (or one call)
 * with no trailing semicolon, and every parameter is parenthesised, so they
 * are safe in unbraced if/else bodies and with arguments such as '*pValue'.
 */

/** Loads the double-width dividend from *puA (low half) and *puD (high half). */
# define DIV_LOAD(a_Dividend) \
    ((a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD)
/** Loads the 16-bit dividend of the 8-bit forms from *puAX. */
# define DIV_LOAD_U8(a_Dividend) \
    ((a_Dividend).u = *puAX)

/** Stores the quotient in *puA and the remainder in *puD. */
# define DIV_STORE(a_Quotient, a_uReminder) (*puA = (a_Quotient), *puD = (a_uReminder))
/** Stores the quotient in the low byte and the remainder in the high byte of *puAX. */
# define DIV_STORE_U8(a_Quotient, a_uReminder) (*puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uReminder) << 8))

/** Fetches the implicit first factor from *puA. */
# define MUL_LOAD_F1() (*puA)
/** Fetches the implicit first factor of the 8-bit forms: the low byte of *puAX. */
# define MUL_LOAD_F1_U8() ((uint8_t)*puAX)

/** Stores the low half of the product in *puA and the high half in *puD. */
# define MUL_STORE(a_Result) (*puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi)
/** Stores the whole 16-bit product of the 8-bit forms in *puAX. */
# define MUL_STORE_U8(a_Result) (*puAX = (a_Result).u)

/** Two's complement negation of a double-width value held in a native integer. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    ((a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u)
/** Negation of a 128-bit value via RTUInt128AssignNeg. */
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/** Unsigned multiplication with a native double-width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    ((a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2))
/** 64-bit by 64-bit multiplication with a 128-bit result via RTUInt128MulU64ByU64. */
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), (a_Factor1), (a_Factor2))

/** Unsigned division and modulo using the native C operators. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    ((a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
     (a_Remainder).u = (a_Dividend).u % (a_uDivisor))
/** 128-bit by 64-bit division and modulo via RTUInt128DivRemByU64. */
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), (a_uDivisor))
2227
2228
2229/*
2230 * MUL
2231 */
2232# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2233IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2234{ \
2235 RTUINT ## a_cBitsWidth2x ## U Result; \
2236 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2237 a_fnStore(Result); \
2238 \
2239 /* Calc EFLAGS: */ \
2240 uint32_t fEfl = *pfEFlags; \
2241 if (a_fIntelFlags) \
2242 { /* Intel: 6700K and 10980XE behavior */ \
2243 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2244 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2245 fEfl |= X86_EFL_SF; \
2246 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2247 if (Result.s.Hi != 0) \
2248 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2249 } \
2250 else \
2251 { /* AMD: 3990X */ \
2252 if (Result.s.Hi != 0) \
2253 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2254 else \
2255 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2256 } \
2257 *pfEFlags = fEfl; \
2258 return 0; \
2259} \
2260
2261# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2262 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2263 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2264 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2265
2266# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2267EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2268 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2269# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2270EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2271 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2272EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2273 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2274EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2275 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2276# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2277# endif /* !DOXYGEN_RUNNING */
2278
2279/*
2280 * MULX
2281 */
2282# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2283IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2284 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2285{ \
2286 RTUINT ## a_cBitsWidth2x ## U Result; \
2287 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2288 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2289 *puDst1 = Result.s.Hi; \
2290} \
2291
2292# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2293EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2294EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2295# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2296EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2297EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2298# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2299# endif /* !DOXYGEN_RUNNING */
2300
2301
2302/*
2303 * IMUL
2304 *
2305 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2306 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2307 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2308 */
2309# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2310 a_Suffix, a_fIntelFlags) \
2311IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2312{ \
2313 RTUINT ## a_cBitsWidth2x ## U Result; \
2314 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2315 \
2316 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2317 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2318 { \
2319 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2320 { \
2321 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2322 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2323 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2324 } \
2325 else \
2326 { \
2327 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2328 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2329 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2330 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2331 a_fnNeg(Result, a_cBitsWidth2x); \
2332 } \
2333 } \
2334 else \
2335 { \
2336 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2337 { \
2338 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2339 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2340 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2341 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2342 a_fnNeg(Result, a_cBitsWidth2x); \
2343 } \
2344 else \
2345 { \
2346 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2347 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2348 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2349 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2350 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2351 } \
2352 } \
2353 a_fnStore(Result); \
2354 \
2355 if (a_fIntelFlags) \
2356 { \
2357 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2358 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2359 fEfl |= X86_EFL_SF; \
2360 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2361 } \
2362 *pfEFlags = fEfl; \
2363 return 0; \
2364}
2365# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2366 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2367 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2368 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2369
2370# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2371EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2372 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2373# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2374EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2375 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2376EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2377 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2378EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2379 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2380# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2381# endif /* !DOXYGEN_RUNNING */
2382
2383
2384/*
2385 * IMUL with two operands are mapped onto the three operand variant, ignoring
2386 * the high part of the product.
2387 */
2388# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2389IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2390{ \
2391 a_uType uIgn; \
2392 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2393} \
2394\
2395IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2396{ \
2397 a_uType uIgn; \
2398 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2399} \
2400\
2401IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2402{ \
2403 a_uType uIgn; \
2404 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2405}
2406
2407EMIT_IMUL_TWO(64, uint64_t)
2408# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2409EMIT_IMUL_TWO(32, uint32_t)
2410EMIT_IMUL_TWO(16, uint16_t)
2411# endif
2412
2413
2414/*
2415 * DIV
2416 */
2417# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2418 a_Suffix, a_fIntelFlags) \
2419IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2420{ \
2421 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2422 a_fnLoad(Dividend); \
2423 if ( uDivisor != 0 \
2424 && Dividend.s.Hi < uDivisor) \
2425 { \
2426 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2427 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2428 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2429 \
2430 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2431 if (!a_fIntelFlags) \
2432 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2433 return 0; \
2434 } \
2435 /* #DE */ \
2436 return -1; \
2437}
2438# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2439 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2440 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2441 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2442
2443# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2444EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2445 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2447EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2448 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2449EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2450 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2451EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2452 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2453# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2454# endif /* !DOXYGEN_RUNNING */
2455
2456
2457/*
2458 * IDIV
2459 *
2460 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2461 * set AF and clear PF, ZF and SF just like it does for DIV.
2462 *
2463 */
2464# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2465 a_Suffix, a_fIntelFlags) \
2466IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2467{ \
2468 /* Note! Skylake leaves all flags alone. */ \
2469 \
2470 /** @todo overflow checks */ \
2471 if (uDivisor != 0) \
2472 { \
2473 /* \
2474 * Convert to unsigned division. \
2475 */ \
2476 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2477 a_fnLoad(Dividend); \
2478 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2479 if (fSignedDividend) \
2480 a_fnNeg(Dividend, a_cBitsWidth2x); \
2481 \
2482 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2483 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2484 uDivisorPositive = uDivisor; \
2485 else \
2486 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2487 \
2488 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2489 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2490 \
2491 /* \
2492 * Setup the result, checking for overflows. \
2493 */ \
2494 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Positive divisor, positive dividend => result positive. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2500 { \
2501 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Positive divisor, negative dividend => result negative. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2511 { \
2512 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 else \
2520 { \
2521 if (!fSignedDividend) \
2522 { \
2523 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2524 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2525 { \
2526 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2527 if (!a_fIntelFlags) \
2528 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2529 return 0; \
2530 } \
2531 } \
2532 else \
2533 { \
2534 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2535 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2536 { \
2537 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2538 if (!a_fIntelFlags) \
2539 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2540 return 0; \
2541 } \
2542 } \
2543 } \
2544 } \
2545 /* #DE */ \
2546 return -1; \
2547}
2548# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2549 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2550 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2551 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2552
2553# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2554EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2555 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2557EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2558 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2559EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2560 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2561EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2562 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2563# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2564# endif /* !DOXYGEN_RUNNING */
2565
2566#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2567
2568
2569/*********************************************************************************************************************************
2570* Unary operations. *
2571*********************************************************************************************************************************/
2572#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2573
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Recalculates PF, AF, ZF, SF and OF for an INC or DEC instruction and writes
 * the updated flags back through a_pfEFlags.  This is a statement macro; it
 * yields no value.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * @param a_pfEFlags    Pointer to the 32-bit EFLAGS value to update.
 * @param a_uResult     Unsigned result value.
 * @param a_uDst        The original destination value (for the AF and OF calc).
 * @param a_cBitsWidth  The width of the result (8, 16, 32, 64).
 * @param a_OfMethod    0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* Drop every status bit except CF. */ \
        uint32_t fEflTmp = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS | X86_EFL_CF); \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        if ((a_OfMethod) == 0) \
            /* INC: the sign bit was clear before and is set now. */ \
            fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)); \
        else \
            /* DEC: the sign bit was set before and is clear now. */ \
            fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2599
2600/*
2601 * INC
2602 */
2603
2604IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2605{
2606 uint64_t uDst = *puDst;
2607 uint64_t uResult = uDst + 1;
2608 *puDst = uResult;
2609 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2610}
2611
2612# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2613
2614IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2615{
2616 uint32_t uDst = *puDst;
2617 uint32_t uResult = uDst + 1;
2618 *puDst = uResult;
2619 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2620}
2621
2622
2623IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2624{
2625 uint16_t uDst = *puDst;
2626 uint16_t uResult = uDst + 1;
2627 *puDst = uResult;
2628 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2629}
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint8_t uDst = *puDst;
2634 uint8_t uResult = uDst + 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2637}
2638
2639# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2640
2641
2642/*
2643 * DEC
2644 */
2645
2646IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2647{
2648 uint64_t uDst = *puDst;
2649 uint64_t uResult = uDst - 1;
2650 *puDst = uResult;
2651 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2652}
2653
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655
2656IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2657{
2658 uint32_t uDst = *puDst;
2659 uint32_t uResult = uDst - 1;
2660 *puDst = uResult;
2661 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2662}
2663
2664
2665IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2666{
2667 uint16_t uDst = *puDst;
2668 uint16_t uResult = uDst - 1;
2669 *puDst = uResult;
2670 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2671}
2672
2673
2674IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2675{
2676 uint8_t uDst = *puDst;
2677 uint8_t uResult = uDst - 1;
2678 *puDst = uResult;
2679 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2680}
2681
2682# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2683
2684
2685/*
2686 * NOT
2687 */
2688
2689IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2690{
2691 uint64_t uDst = *puDst;
2692 uint64_t uResult = ~uDst;
2693 *puDst = uResult;
2694 /* EFLAGS are not modified. */
2695 RT_NOREF_PV(pfEFlags);
2696}
2697
2698# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2699
2700IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2701{
2702 uint32_t uDst = *puDst;
2703 uint32_t uResult = ~uDst;
2704 *puDst = uResult;
2705 /* EFLAGS are not modified. */
2706 RT_NOREF_PV(pfEFlags);
2707}
2708
2709IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2710{
2711 uint16_t uDst = *puDst;
2712 uint16_t uResult = ~uDst;
2713 *puDst = uResult;
2714 /* EFLAGS are not modified. */
2715 RT_NOREF_PV(pfEFlags);
2716}
2717
2718IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2719{
2720 uint8_t uDst = *puDst;
2721 uint8_t uResult = ~uDst;
2722 *puDst = uResult;
2723 /* EFLAGS are not modified. */
2724 RT_NOREF_PV(pfEFlags);
2725}
2726
2727# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2728
2729
2730/*
2731 * NEG
2732 */
2733
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an NEG instruction.
 *
 * This is a statement macro (do/while); it yields no value.  The updated flags
 * are written back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for CF, AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
        /* CF: set whenever the original operand was non-zero. */ \
        fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        /* AF: taken from the XOR of the result and the original value. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF input: bits set in both the original value and the result. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2755
2756IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2757{
2758 uint64_t uDst = *puDst;
2759 uint64_t uResult = (uint64_t)0 - uDst;
2760 *puDst = uResult;
2761 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2762}
2763
2764# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2765
2766IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2767{
2768 uint32_t uDst = *puDst;
2769 uint32_t uResult = (uint32_t)0 - uDst;
2770 *puDst = uResult;
2771 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2772}
2773
2774
2775IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2776{
2777 uint16_t uDst = *puDst;
2778 uint16_t uResult = (uint16_t)0 - uDst;
2779 *puDst = uResult;
2780 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2781}
2782
2783
2784IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2785{
2786 uint8_t uDst = *puDst;
2787 uint8_t uResult = (uint8_t)0 - uDst;
2788 *puDst = uResult;
2789 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2790}
2791
2792# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2793
2794/*
2795 * Locked variants.
2796 */
2797
/**
 * Emit a function for doing a locked unary operand operation.
 *
 * The emitted iemAImpl_<mnemonic>_u<width>_locked function runs the plain
 * iemAImpl_<mnemonic>_u<width> worker on a local copy of the operand and a local
 * copy of EFLAGS, then tries to publish the new operand value with a
 * compare-and-exchange, repeating until the exchange succeeds.
 *
 * @param   a_Mnemonic      Instruction mnemonic (inc, dec, not, neg).
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32, 64).
 */
# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
                                                                                   uint32_t *pfEFlags)) \
    { \
        uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uTmp; \
        uint32_t fEflTmp; \
        do \
        { \
            /* Start each attempt from the latest observed value and the caller's \
               original flags. */ \
            uTmp = uOld; \
            fEflTmp = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
            /* NOTE(review): the last argument presumably receives the value found \
               in *puDst when the exchange fails - confirm against the IPRT docs. */ \
        } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
        /* Only the flags of the attempt that was actually stored are committed. */ \
        *pfEFlags = fEflTmp; \
    }
2814
/* Locked INC/DEC/NOT/NEG workers.  The 64-bit ones are emitted whenever this
   section is compiled; the 32, 16 and 8-bit ones are additionally skipped on
   RT_ARCH_X86 unless IEM_WITHOUT_ASSEMBLY is defined. */
EMIT_LOCKED_UNARY_OP(inc, 64)
EMIT_LOCKED_UNARY_OP(dec, 64)
EMIT_LOCKED_UNARY_OP(not, 64)
EMIT_LOCKED_UNARY_OP(neg, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_LOCKED_UNARY_OP(inc, 32)
EMIT_LOCKED_UNARY_OP(dec, 32)
EMIT_LOCKED_UNARY_OP(not, 32)
EMIT_LOCKED_UNARY_OP(neg, 32)

EMIT_LOCKED_UNARY_OP(inc, 16)
EMIT_LOCKED_UNARY_OP(dec, 16)
EMIT_LOCKED_UNARY_OP(not, 16)
EMIT_LOCKED_UNARY_OP(neg, 16)

EMIT_LOCKED_UNARY_OP(inc, 8)
EMIT_LOCKED_UNARY_OP(dec, 8)
EMIT_LOCKED_UNARY_OP(not, 8)
EMIT_LOCKED_UNARY_OP(neg, 8)
# endif
2835
2836#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2837
2838
2839/*********************************************************************************************************************************
2840* Shifting and Rotating *
2841*********************************************************************************************************************************/
2842
2843/*
2844 * ROL
2845 */
/**
 * Emits a ROL (rotate left) worker named iemAImpl_rol_u<width><suffix>.
 *
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32 or 64).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: OF according to the first sub-shift (Intel);
 *                          zero: OF according to the last sub-shift (AMD).
 * @param   a_fnHlp         Helper doing the actual rotation of an a_uType value.
 */
#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    /* Count mask: width - 1 for 32/64-bit operands, 31 for 8/16-bit ones. */ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        /* Narrow operands are reduced to the effective rotation only after the \
           zero check, so e.g. a 16-bit rotate by 16 still updates the flags. */ \
        if (a_cBitsWidth < 32) \
            cShift &= a_cBitsWidth - 1; \
        a_uType const uDst = *puDst; \
        a_uType const uResult = a_fnHlp(uDst, cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS.  The OF bit is undefined if cShift > 1, we implement \
           it the same way as for 1 bit shifts. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* CF is bit 0 of the result, i.e. the bit that wrapped around last. */ \
        uint32_t const fCarry = (uResult & X86_EFL_CF); \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
        else /* Intel 10980XE: According to the first sub-shift: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
        *pfEFlags = fEfl; \
    } \
}
2872
/* 64-bit and 32-bit ROL workers.  The _intel and _amd flavours are always
   emitted; the suffix-less one only when the preprocessor condition holds. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
#endif
EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
#endif
EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2884
2885DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2886{
2887 return (uValue << cShift) | (uValue >> (16 - cShift));
2888}
/* 16-bit ROL workers, rotating via iemAImpl_rol_u16_hlp. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
#endif
EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2894
2895DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2896{
2897 return (uValue << cShift) | (uValue >> (8 - cShift));
2898}
/* 8-bit ROL workers, rotating via iemAImpl_rol_u8_hlp. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
#endif
EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2904
2905
2906/*
2907 * ROR
2908 */
/**
 * Emits a ROR (rotate right) worker named iemAImpl_ror_u<width><suffix>.
 *
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32 or 64).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: OF according to the first sub-shift (Intel);
 *                          zero: OF according to the last sub-shift (AMD).
 * @param   a_fnHlp         Helper doing the actual rotation of an a_uType value.
 */
#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    /* Count mask: width - 1 for 32/64-bit operands, 31 for 8/16-bit ones. */ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        /* Narrow operands are reduced to the effective rotation only after the \
           zero check, so e.g. a 16-bit rotate by 16 still updates the flags. */ \
        if (a_cBitsWidth < 32) \
            cShift &= a_cBitsWidth - 1; \
        a_uType const uDst = *puDst; \
        a_uType const uResult = a_fnHlp(uDst, cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS: */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* CF is the top bit of the result, i.e. the bit that wrapped around last. */ \
        uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
        else /* Intel 10980XE: According to the first sub-shift: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
        *pfEFlags = fEfl; \
    } \
}
2934
/* 64-bit and 32-bit ROR workers.  The _intel and _amd flavours are always
   emitted; the suffix-less one only when the preprocessor condition holds. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
#endif
EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
#endif
EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2946
2947DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2948{
2949 return (uValue >> cShift) | (uValue << (16 - cShift));
2950}
/* 16-bit ROR workers, rotating via iemAImpl_ror_u16_hlp. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
#endif
EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2956
2957DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2958{
2959 return (uValue >> cShift) | (uValue << (8 - cShift));
2960}
/* 8-bit ROR workers, rotating via iemAImpl_ror_u8_hlp. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
#endif
EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2966
2967
2968/*
2969 * RCL
2970 */
/**
 * Emits an RCL (rotate left through carry) worker named
 * iemAImpl_rcl_u<width><suffix>.
 *
 * For 8 and 16-bit operands the count is taken modulo width + 1.  With Intel
 * flags this happens before the zero check, so a count that reduces to zero
 * leaves operand and flags untouched.  With AMD flags it happens after the
 * zero check, so such a count still runs the flag update: the operand and CF
 * keep their values while OF is recalculated.
 *
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32 or 64).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: Intel flag behaviour; zero: AMD flag behaviour.
 */
#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (a_cBitsWidth < 32 && a_fIntelFlags) \
        cShift %= a_cBitsWidth + 1; \
    if (cShift) \
    { \
        if (a_cBitsWidth < 32 && !a_fIntelFlags) \
            cShift %= a_cBitsWidth + 1; \
        a_uType const uDst = *puDst; \
        a_uType uResult = uDst << cShift; \
        if (cShift > 1) \
            uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
        \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        uint32_t fInCarry = fEfl & X86_EFL_CF; \
        /* cShift may have been reduced to zero just above (AMD, 8/16-bit); the \
           carry is not rotated into the operand then, and shifting by \
           cShift - 1 == -1 would be undefined behaviour. */ \
        if (cShift) \
            uResult |= (a_uType)fInCarry << (cShift - 1); \
        \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* CF is the last bit rotated out of the operand, or the unchanged \
           incoming CF when the effective count is zero. */ \
        uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
                                 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
        fEfl |= fOutCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
        else /* Intel 10980XE: According to the first sub-shift: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
        *pfEFlags = fEfl; \
    } \
}
3005
/* RCL workers for all four operand widths.  The _intel and _amd flavours are
   always emitted; the suffix-less ones only when the preprocessor condition
   in front of them holds. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_RCL(64, uint64_t, _intel, 1)
EMIT_RCL(64, uint64_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_RCL(32, uint32_t, _intel, 1)
EMIT_RCL(32, uint32_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_RCL(16, uint16_t, _intel, 1)
EMIT_RCL(16, uint16_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
#endif
EMIT_RCL(8, uint8_t, _intel, 1)
EMIT_RCL(8, uint8_t, _amd, 0)
3029
3030
3031/*
3032 * RCR
3033 */
/**
 * Emits an RCR (rotate right through carry) worker named
 * iemAImpl_rcr_u<width><suffix>.
 *
 * For 8 and 16-bit operands the count is taken modulo width + 1; with Intel
 * flags before the zero check, with AMD flags after it.
 *
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32 or 64).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: Intel flag behaviour; zero: AMD flag behaviour.
 */
#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (a_cBitsWidth < 32 && a_fIntelFlags) \
        cShift %= a_cBitsWidth + 1; \
    if (cShift) \
    { \
        if (a_cBitsWidth < 32 && !a_fIntelFlags) \
            cShift %= a_cBitsWidth + 1; \
        a_uType const uDst = *puDst; \
        a_uType uResult = uDst >> cShift; \
        if (cShift > 1) \
            uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
        \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        uint32_t fInCarry = fEfl & X86_EFL_CF; \
        /* When cShift was reduced to zero above, the carry lands one bit above \
           the operand and is dropped by the conversion back to a_uType. */ \
        uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS.  The OF bit is undefined if cShift > 1, we implement \
           it the same way as for 1 bit shifts. */ \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* CF is the last bit rotated out of the operand, or the unchanged \
           incoming CF when the effective count is zero. */ \
        uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
                                 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
        fEfl |= fOutCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: XOR two most significant bits of the result: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
        else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
            fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
        *pfEFlags = fEfl; \
    } \
}
3068
/* RCR workers for all four operand widths.  The _intel and _amd flavours are
   always emitted; the suffix-less ones only when the preprocessor condition
   in front of them holds. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_RCR(64, uint64_t, _intel, 1)
EMIT_RCR(64, uint64_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_RCR(32, uint32_t, _intel, 1)
EMIT_RCR(32, uint32_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_RCR(16, uint16_t, _intel, 1)
EMIT_RCR(16, uint16_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
#endif
EMIT_RCR(8, uint8_t, _intel, 1)
EMIT_RCR(8, uint8_t, _amd, 0)
3092
3093
3094/*
3095 * SHL
3096 */
/**
 * Emits a SHL (shift left) worker named iemAImpl_shl_u<width><suffix>.
 *
 * The count is masked with width - 1 for 32/64-bit operands and with 31 for
 * 8/16-bit ones, so narrow operands can be shifted by more than their width.
 *
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32 or 64).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: OF from the first sub-shift and AF cleared
 *                          (Intel); zero: OF from the last sub-shift and AF set (AMD).
 */
#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_uType const uDst = *puDst; \
        a_uType uResult = uDst << cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* CF is the last bit shifted out.  For 8/16-bit operands the count can \
           exceed the width; every operand bit is gone before the last sub-shift \
           then, so CF is zero (and width - cShift would be a negative, i.e. \
           undefined, shift count). */ \
        uint32_t fCarry = cShift <= a_cBitsWidth \
                        ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : 0; \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) \
            fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
        else \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}
3124
/* SHL workers for all four operand widths.  The _intel and _amd flavours are
   always emitted; the suffix-less ones only when the preprocessor condition
   in front of them holds. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHL(64, uint64_t, _intel, 1)
EMIT_SHL(64, uint64_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHL(32, uint32_t, _intel, 1)
EMIT_SHL(32, uint32_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_SHL(16, uint16_t, _intel, 1)
EMIT_SHL(16, uint16_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
#endif
EMIT_SHL(8, uint8_t, _intel, 1)
EMIT_SHL(8, uint8_t, _amd, 0)
3148
3149
3150/*
3151 * SHR
3152 */
/**
 * Emits a SHR (logical shift right) worker named iemAImpl_shr_u<width><suffix>.
 *
 * The count is masked with width - 1 for 32/64-bit operands and with 31 for
 * 8/16-bit ones.
 *
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32 or 64).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: Intel flag behaviour; zero: AMD flag behaviour.
 */
#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_uType const uDst = *puDst; \
        a_uType uResult = uDst >> cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* CF is the last bit shifted out at the bottom. */ \
        fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
        /* OF is the top bit of the original operand; on the AMD path it is \
           only set for single bit shifts and stays clear otherwise. */ \
        if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
            fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}
3177
/* SHR workers for all four operand widths.  The _intel and _amd flavours are
   always emitted; the suffix-less ones only when the preprocessor condition
   in front of them holds. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHR(64, uint64_t, _intel, 1)
EMIT_SHR(64, uint64_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHR(32, uint32_t, _intel, 1)
EMIT_SHR(32, uint32_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_SHR(16, uint16_t, _intel, 1)
EMIT_SHR(16, uint16_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
#endif
EMIT_SHR(8, uint8_t, _intel, 1)
EMIT_SHR(8, uint8_t, _amd, 0)
3201
3202
3203/*
3204 * SAR
3205 */
/**
 * Emits a SAR (arithmetic shift right) worker named iemAImpl_sar_u<width><suffix>.
 *
 * The count is masked with width - 1 for 32/64-bit operands and with 31 for
 * 8/16-bit ones.  The shift is done on the signed type @a a_iType.
 *
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32 or 64).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_iType         Signed integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: AF cleared (Intel); zero: AF set (AMD).
 */
#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_iType const iDst = (a_iType)*puDst; \
        a_uType uResult = iDst >> cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. \
           Note! The OF flag is always zero because the sign of the result never \
                 differs from the sign of the input. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* CF is the last bit shifted out at the bottom. */ \
        fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}
3229
3230#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3231EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3232#endif
3233EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3234EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3235
3236#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3237EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3238#endif
3239EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3240EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3241
3242#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3243EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3244#endif
3245EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3246EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3247
3248#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3249EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3250#endif
3251EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3252EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3253
3254
3255/*
3256 * SHLD
3257 *
3258 * - CF is the last bit shifted out of puDst.
3259 * - AF is always cleared by Intel 10980XE.
3260 * - AF is always set by AMD 3990X.
3261 * - OF is set according to the first shift on Intel 10980XE, it seems.
3262 * - OF is set according to the last sub-shift on AMD 3990X.
3263 * - ZF, SF and PF are calculated according to the result by both vendors.
3264 *
3265 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3266 * pick either the source register or the destination register for input bits
3267 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3268 * intel has changed behaviour here several times. We implement what current
3269 * skylake based does for now, we can extend this later as needed.
3270 */
/**
 * Emits a SHLD (double precision shift left) worker named
 * iemAImpl_shld_u<width><suffix>.  The count is masked with width - 1.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: Intel flag behaviour; zero: AMD flag behaviour.
 */
#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
                                                                        uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth - 1; \
    if (cShift) \
    { \
        a_uType const uDst = *puDst; \
        /* The destination moves up; the vacated low bits are filled with the \
           top cShift bits of the source. */ \
        a_uType uResult = uDst << cShift; \
        uResult |= uSrc >> (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        /* CALC EFLAGS: */ \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        if (a_fIntelFlags) \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
        else \
        { /* AMD 3990X: Set according to last shift. AF always set. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
        fEfl |= g_afParity[uResult & 0xff]; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        *pfEFlags = fEfl; \
    } \
}
3301
/* 64-bit and 32-bit SHLD workers.  The _intel and _amd flavours are always
   emitted; the suffix-less ones only when the preprocessor condition holds. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHLD(64, uint64_t, _intel, 1)
EMIT_SHLD(64, uint64_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHLD(32, uint32_t, _intel, 1)
EMIT_SHLD(32, uint32_t, _amd, 0)
3313
/**
 * Emits the 16-bit SHLD worker named iemAImpl_shld_u16<suffix>.
 *
 * The count is masked with 31, so it can exceed the operand width.  The shift
 * is done on a 48-bit value held in a uint64_t: uDst:uSrc:uDst with Intel flags
 * and uDst:uSrc:uSrc with AMD flags (most significant part first).
 *
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: Intel behaviour; zero: AMD behaviour.
 */
#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst = *puDst; \
        uint64_t const uTmp = a_fIntelFlags \
                            ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
                            : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
        /* The result is bits 32 thru 47 of the shifted combined value. */ \
        uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
        *puDst = uResult; \
        \
        /* CALC EFLAGS: */ \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
            /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
        } \
        else \
        { \
            /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
            if (cShift < 16) \
            { \
                fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
                fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
            } \
            else \
            { \
                /* CF stays clear for counts above 16. */ \
                if (cShift == 16) \
                    fEfl |= uDst & X86_EFL_CF; \
                fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
            } \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= g_afParity[uResult & 0xff]; \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        *pfEFlags = fEfl; \
    } \
}
3358
/* 16-bit SHLD workers; the suffix-less one uses the Intel behaviour. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD_16(RT_NOTHING, 1)
#endif
EMIT_SHLD_16(_intel, 1)
EMIT_SHLD_16(_amd, 0)
3364
3365
3366/*
3367 * SHRD
3368 *
3369 * EFLAGS behaviour seems to be the same as with SHLD:
3370 * - CF is the last bit shifted out of puDst.
3371 * - AF is always cleared by Intel 10980XE.
3372 * - AF is always set by AMD 3990X.
3373 * - OF is set according to the first shift on Intel 10980XE, it seems.
3374 * - OF is set according to the last sub-shift on AMD 3990X.
3375 * - ZF, SF and PF are calculated according to the result by both vendors.
3376 *
3377 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3378 * pick either the source register or the destination register for input bits
3379 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3380 * intel has changed behaviour here several times. We implement what current
3381 * skylake based does for now, we can extend this later as needed.
3382 */
/**
 * Emits a SHRD (double precision shift right) worker named
 * iemAImpl_shrd_u<width><suffix>.  The count is masked with width - 1.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: Intel flag behaviour; zero: AMD flag behaviour.
 */
#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth - 1; \
    if (cShift) \
    { \
        a_uType const uDst = *puDst; \
        /* The destination moves down; the vacated high bits are filled with \
           the low cShift bits of the source. */ \
        a_uType uResult = uDst >> cShift; \
        uResult |= uSrc << (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        /* CF is the last bit shifted out of the destination. */ \
        fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
        if (a_fIntelFlags) \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
        else \
        { /* AMD 3990X: Set according to last shift. AF always set. */ \
            if (cShift > 1) /* Set according to last shift. */ \
                fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
            else \
                fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        *pfEFlags = fEfl; \
    } \
}
3414
/* 64-bit and 32-bit SHRD workers.  The _intel and _amd flavours are always
   emitted; the suffix-less ones only when the preprocessor condition holds. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHRD(64, uint64_t, _intel, 1)
EMIT_SHRD(64, uint64_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHRD(32, uint32_t, _intel, 1)
EMIT_SHRD(32, uint32_t, _amd, 0)
3426
/**
 * Emits the 16-bit SHRD worker named iemAImpl_shrd_u16<suffix>.
 *
 * The count is masked with 31, so it can exceed the operand width.  The shift
 * is done on a 48-bit value held in a uint64_t: uDst:uSrc:uDst with Intel flags
 * and uSrc:uSrc:uDst with AMD flags (most significant part first).
 *
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero: Intel behaviour; zero: AMD behaviour.
 */
#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst = *puDst; \
        uint64_t const uTmp = a_fIntelFlags \
                            ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
                            : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
        /* The result is the low 16 bits of the shifted combined value. */ \
        uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
        *puDst = uResult; \
        \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            /* Intel 10980XE: The CF is the last bit shifted out of the combined uTmp operand. */ \
            fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
        } \
        else \
        { \
            /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
            fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
            /* AMD 3990X: Set according to last shift. AF always set. */ \
            if (cShift > 1) /* Set according to last shift. */ \
                fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
            else \
                fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        *pfEFlags = fEfl; \
    } \
}
3466
3467#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3468EMIT_SHRD_16(RT_NOTHING, 1)
3469#endif
3470EMIT_SHRD_16(_intel, 1)
3471EMIT_SHRD_16(_amd, 0)
3472
3473
3474/*
3475 * RORX (BMI2)
3476 */
3477#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3478IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3479{ \
3480 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3481}
3482
3483#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3484EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3485#endif
3486#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3487EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3488#endif
3489
3490
3491/*
3492 * SHLX (BMI2)
3493 */
3494#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3495IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3496{ \
3497 cShift &= a_cBitsWidth - 1; \
3498 *puDst = uSrc << cShift; \
3499}
3500
3501#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3502EMIT_SHLX(64, uint64_t, RT_NOTHING)
3503EMIT_SHLX(64, uint64_t, _fallback)
3504#endif
3505#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3506EMIT_SHLX(32, uint32_t, RT_NOTHING)
3507EMIT_SHLX(32, uint32_t, _fallback)
3508#endif
3509
3510
3511/*
3512 * SHRX (BMI2)
3513 */
3514#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3515IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3516{ \
3517 cShift &= a_cBitsWidth - 1; \
3518 *puDst = uSrc >> cShift; \
3519}
3520
3521#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3522EMIT_SHRX(64, uint64_t, RT_NOTHING)
3523EMIT_SHRX(64, uint64_t, _fallback)
3524#endif
3525#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3526EMIT_SHRX(32, uint32_t, RT_NOTHING)
3527EMIT_SHRX(32, uint32_t, _fallback)
3528#endif
3529
3530
3531/*
3532 * SARX (BMI2)
3533 */
3534#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3535IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3536{ \
3537 cShift &= a_cBitsWidth - 1; \
3538 *puDst = (a_iType)uSrc >> cShift; \
3539}
3540
3541#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3542EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3543EMIT_SARX(64, uint64_t, int64_t, _fallback)
3544#endif
3545#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3546EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3547EMIT_SARX(32, uint32_t, int32_t, _fallback)
3548#endif
3549
3550
3551/*
3552 * PDEP (BMI2)
3553 */
3554#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PDEP(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PDEP(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PDEP(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PDEP(32, uint32_t, _fallback)
3575
3576/*
3577 * PEXT (BMI2)
3578 */
3579#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3580IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3581{ \
3582 a_uType uResult = 0; \
3583 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3584 if (fMask & ((a_uType)1 << iMaskBit)) \
3585 { \
3586 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3587 iBit++; \
3588 } \
3589 *puDst = uResult; \
3590}
3591
3592#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3593EMIT_PEXT(64, uint64_t, RT_NOTHING)
3594#endif
3595EMIT_PEXT(64, uint64_t, _fallback)
3596#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3597EMIT_PEXT(32, uint32_t, RT_NOTHING)
3598#endif
3599EMIT_PEXT(32, uint32_t, _fallback)
3600
3601
3602#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3603
3604# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3605/*
3606 * BSWAP
3607 */
3608
3609IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3610{
3611 *puDst = ASMByteSwapU64(*puDst);
3612}
3613
3614
3615IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3616{
3617 *puDst = ASMByteSwapU32(*puDst);
3618}
3619
3620
3621/* Note! undocument, so 32-bit arg */
3622IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3623{
3624#if 0
3625 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3626#else
3627 /* This is the behaviour AMD 3990x (64-bit mode): */
3628 *(uint16_t *)puDst = 0;
3629#endif
3630}
3631
3632# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3633
3634
3635
3636# if defined(IEM_WITHOUT_ASSEMBLY)
3637
3638/*
3639 * LFENCE, SFENCE & MFENCE.
3640 */
3641
/**
 * LFENCE worker for builds without the assembly implementation; it simply
 * defers to ASMReadFence().
 *
 * NOTE(review): presumably ASMReadFence() orders loads on the host - confirm
 * against the IPRT documentation.
 */
3642 IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3643 {
3644 ASMReadFence();
3645 }
3646
3647
/**
 * SFENCE worker for builds without the assembly implementation; it simply
 * defers to ASMWriteFence().
 *
 * NOTE(review): presumably ASMWriteFence() orders stores on the host - confirm
 * against the IPRT documentation.
 */
3648 IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3649 {
3650 ASMWriteFence();
3651 }
3652
3653
/**
 * MFENCE worker for builds without the assembly implementation; it simply
 * defers to ASMMemoryFence().
 *
 * NOTE(review): presumably ASMMemoryFence() is a full load+store fence on the
 * host - confirm against the IPRT documentation.
 */
3654 IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3655 {
3656 ASMMemoryFence();
3657 }
3658
3659
/*
 * Alternative memory fence worker.  It is only compiled when the target is not
 * ARM64 and does the same as iemAImpl_mfence here, i.e. calls ASMMemoryFence().
 *
 * NOTE(review): presumably meant for guests/hosts where MFENCE is not
 * available - confirm with the callers.
 */
3660 # ifndef RT_ARCH_ARM64
3661 IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3662 {
3663 ASMMemoryFence();
3664 }
3665 # endif
3666
3667# endif
3668
3669#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3670
3671
3672IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3673{
3674 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3675 {
3676 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3677 *pu16Dst |= u16Src & X86_SEL_RPL;
3678
3679 *pfEFlags |= X86_EFL_ZF;
3680 }
3681 else
3682 *pfEFlags &= ~X86_EFL_ZF;
3683}
3684
3685
3686#if defined(IEM_WITHOUT_ASSEMBLY)
3687
3688/*********************************************************************************************************************************
3689* x87 FPU Loads *
3690*********************************************************************************************************************************/
3691
3692IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3693{
3694 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3695 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3696 {
3697 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3698 pFpuRes->r80Result.sj64.fInteger = 1;
3699 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3700 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3701 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3702 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3703 }
3704 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3705 {
3706 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3707 pFpuRes->r80Result.s.uExponent = 0;
3708 pFpuRes->r80Result.s.uMantissa = 0;
3709 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3710 }
3711 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3712 {
3713 /* Subnormal values gets normalized. */
3714 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3715 pFpuRes->r80Result.sj64.fInteger = 1;
3716 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3717 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3718 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3719 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3720 pFpuRes->FSW |= X86_FSW_DE;
3721 if (!(pFpuState->FCW & X86_FCW_DM))
3722 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3723 }
3724 else if (RTFLOAT32U_IS_INF(pr32Val))
3725 {
3726 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3727 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3728 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3729 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3730 }
3731 else
3732 {
3733 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3734 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3735 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3736 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3737 pFpuRes->r80Result.sj64.fInteger = 1;
3738 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3739 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3740 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3741 {
3742 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3743 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3744 pFpuRes->FSW |= X86_FSW_IE;
3745
3746 if (!(pFpuState->FCW & X86_FCW_IM))
3747 {
3748 /* The value is not pushed. */
3749 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3750 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3751 pFpuRes->r80Result.au64[0] = 0;
3752 pFpuRes->r80Result.au16[4] = 0;
3753 }
3754 }
3755 else
3756 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3757 }
3758}
3759
3760
3761IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3762{
3763 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3764 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3765 {
3766 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3767 pFpuRes->r80Result.sj64.fInteger = 1;
3768 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3769 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3770 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3771 }
3772 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3773 {
3774 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3775 pFpuRes->r80Result.s.uExponent = 0;
3776 pFpuRes->r80Result.s.uMantissa = 0;
3777 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3778 }
3779 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3780 {
3781 /* Subnormal values gets normalized. */
3782 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3783 pFpuRes->r80Result.sj64.fInteger = 1;
3784 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3785 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3786 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3787 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3788 pFpuRes->FSW |= X86_FSW_DE;
3789 if (!(pFpuState->FCW & X86_FCW_DM))
3790 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3791 }
3792 else if (RTFLOAT64U_IS_INF(pr64Val))
3793 {
3794 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3795 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3796 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3797 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3798 }
3799 else
3800 {
3801 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3802 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3803 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3804 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3805 pFpuRes->r80Result.sj64.fInteger = 1;
3806 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3807 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3808 {
3809 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3810 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3811 pFpuRes->FSW |= X86_FSW_IE;
3812
3813 if (!(pFpuState->FCW & X86_FCW_IM))
3814 {
3815 /* The value is not pushed. */
3816 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3817 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3818 pFpuRes->r80Result.au64[0] = 0;
3819 pFpuRes->r80Result.au16[4] = 0;
3820 }
3821 }
3822 else
3823 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3824 }
3825}
3826
3827
3828IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3829{
3830 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3831 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3832 /* Raises no exceptions. */
3833 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3834}
3835
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3838{
3839 pFpuRes->r80Result.sj64.fSign = 0;
3840 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3841 pFpuRes->r80Result.sj64.fInteger = 1;
3842 pFpuRes->r80Result.sj64.uFraction = 0;
3843
3844 /*
3845 * FPU status word:
3846 * - TOP is irrelevant, but we must match x86 assembly version.
3847 * - C1 is always cleared as we don't have any stack overflows.
3848 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3849 */
3850 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3851}
3852
3853
3854IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3855{
3856 pFpuRes->r80Result.sj64.fSign = 0;
3857 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3858 pFpuRes->r80Result.sj64.fInteger = 1;
3859 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3860 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3861 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3862 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3863}
3864
3865
3866IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3867{
3868 pFpuRes->r80Result.sj64.fSign = 0;
3869 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3870 pFpuRes->r80Result.sj64.fInteger = 1;
3871 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3872 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3873 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3874}
3875
3876
3877IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3878{
3879 pFpuRes->r80Result.sj64.fSign = 0;
3880 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3881 pFpuRes->r80Result.sj64.fInteger = 1;
3882 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3883 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3884 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3885 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3886}
3887
3888
3889IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3890{
3891 pFpuRes->r80Result.sj64.fSign = 0;
3892 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3893 pFpuRes->r80Result.sj64.fInteger = 1;
3894 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3895 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3896 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3897 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3898}
3899
3900
3901IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3902{
3903 pFpuRes->r80Result.sj64.fSign = 0;
3904 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3905 pFpuRes->r80Result.sj64.fInteger = 1;
3906 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3907 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3908 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3909 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3910}
3911
3912
3913IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3914{
3915 pFpuRes->r80Result.s.fSign = 0;
3916 pFpuRes->r80Result.s.uExponent = 0;
3917 pFpuRes->r80Result.s.uMantissa = 0;
3918 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3919}
3920
3921#define EMIT_FILD(a_cBits) \
3922IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3923 int ## a_cBits ## _t const *piVal)) \
3924{ \
3925 int ## a_cBits ## _t iVal = *piVal; \
3926 if (iVal == 0) \
3927 { \
3928 pFpuRes->r80Result.s.fSign = 0; \
3929 pFpuRes->r80Result.s.uExponent = 0; \
3930 pFpuRes->r80Result.s.uMantissa = 0; \
3931 } \
3932 else \
3933 { \
3934 if (iVal > 0) \
3935 pFpuRes->r80Result.s.fSign = 0; \
3936 else \
3937 { \
3938 pFpuRes->r80Result.s.fSign = 1; \
3939 iVal = -iVal; \
3940 } \
3941 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3942 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3943 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3944 } \
3945 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3946}
3947EMIT_FILD(16)
3948EMIT_FILD(32)
3949EMIT_FILD(64)
3950
3951
3952IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3953{
3954 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3955 if ( pd80Val->s.abPairs[0] == 0
3956 && pd80Val->s.abPairs[1] == 0
3957 && pd80Val->s.abPairs[2] == 0
3958 && pd80Val->s.abPairs[3] == 0
3959 && pd80Val->s.abPairs[4] == 0
3960 && pd80Val->s.abPairs[5] == 0
3961 && pd80Val->s.abPairs[6] == 0
3962 && pd80Val->s.abPairs[7] == 0
3963 && pd80Val->s.abPairs[8] == 0)
3964 {
3965 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3966 pFpuRes->r80Result.s.uExponent = 0;
3967 pFpuRes->r80Result.s.uMantissa = 0;
3968 }
3969 else
3970 {
3971 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3972
3973 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3974 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3975 cPairs--;
3976
3977 uint64_t uVal = 0;
3978 uint64_t uFactor = 1;
3979 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3980 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3981 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3982
3983 unsigned const cBits = ASMBitLastSetU64(uVal);
3984 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3985 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3986 }
3987}
3988
3989
3990/*********************************************************************************************************************************
3991* x87 FPU Stores *
3992*********************************************************************************************************************************/
3993
3994/**
3995 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
3996 *
3997 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3998 *
3999 * @returns Updated FPU status word value.
4000 * @param fSignIn Incoming sign indicator.
4001 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4002 * @param iExponentIn Unbiased exponent.
4003 * @param fFcw The FPU control word.
4004 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4005 * @param pr32Dst Where to return the output value, if one should be
4006 * returned.
4007 *
4008 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4009 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4010 */
4011static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4012 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4013{
4014 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4015 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4016 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4017 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4018 ? fRoundingOffMask
4019 : 0;
4020 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4021
4022 /*
4023 * Deal with potential overflows/underflows first, optimizing for none.
4024 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4025 */
4026 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4027 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4028 { /* likely? */ }
4029 /*
4030 * Underflow if the exponent zero or negative. This is attempted mapped
4031 * to a subnormal number when possible, with some additional trickery ofc.
4032 */
4033 else if (iExponentOut <= 0)
4034 {
4035 bool const fIsTiny = iExponentOut < 0
4036 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4037 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4038 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4039 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4040
4041 if (iExponentOut <= 0)
4042 {
4043 uMantissaIn = iExponentOut <= -63
4044 ? uMantissaIn != 0
4045 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4046 fRoundedOff = uMantissaIn & fRoundingOffMask;
4047 if (fRoundedOff && fIsTiny)
4048 fFsw |= X86_FSW_UE;
4049 iExponentOut = 0;
4050 }
4051 }
4052 /*
4053 * Overflow if at or above max exponent value or if we will reach max
4054 * when rounding. Will return +/-zero or +/-max value depending on
4055 * whether we're rounding or not.
4056 */
4057 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4058 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4059 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4060 {
4061 fFsw |= X86_FSW_OE;
4062 if (!(fFcw & X86_FCW_OM))
4063 return fFsw | X86_FSW_ES | X86_FSW_B;
4064 fFsw |= X86_FSW_PE;
4065 if (uRoundingAdd)
4066 fFsw |= X86_FSW_C1;
4067 if (!(fFcw & X86_FCW_PM))
4068 fFsw |= X86_FSW_ES | X86_FSW_B;
4069
4070 pr32Dst->s.fSign = fSignIn;
4071 if (uRoundingAdd)
4072 { /* Zero */
4073 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4074 pr32Dst->s.uFraction = 0;
4075 }
4076 else
4077 { /* Max */
4078 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4079 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4080 }
4081 return fFsw;
4082 }
4083
4084 /*
4085 * Normal or subnormal number.
4086 */
4087 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4088 uint64_t uMantissaOut = uMantissaIn;
4089 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4090 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4091 || fRoundedOff != uRoundingAdd)
4092 {
4093 uMantissaOut = uMantissaIn + uRoundingAdd;
4094 if (uMantissaOut >= uMantissaIn)
4095 { /* likely */ }
4096 else
4097 {
4098 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4099 iExponentOut++;
4100 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4101 fFsw |= X86_FSW_C1;
4102 }
4103 }
4104 else
4105 uMantissaOut = uMantissaIn;
4106
4107 /* Truncate the mantissa and set the return value. */
4108 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4109
4110 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4111 pr32Dst->s.uExponent = iExponentOut;
4112 pr32Dst->s.fSign = fSignIn;
4113
4114 /* Set status flags realted to rounding. */
4115 if (fRoundedOff)
4116 {
4117 fFsw |= X86_FSW_PE;
4118 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4119 fFsw |= X86_FSW_C1;
4120 if (!(fFcw & X86_FCW_PM))
4121 fFsw |= X86_FSW_ES | X86_FSW_B;
4122 }
4123
4124 return fFsw;
4125}
4126
4127
4128/**
4129 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4130 */
4131IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4132 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4133{
4134 uint16_t const fFcw = pFpuState->FCW;
4135 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4136 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4137 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4138 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4139 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4140 {
4141 pr32Dst->s.fSign = pr80Src->s.fSign;
4142 pr32Dst->s.uExponent = 0;
4143 pr32Dst->s.uFraction = 0;
4144 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4145 }
4146 else if (RTFLOAT80U_IS_INF(pr80Src))
4147 {
4148 pr32Dst->s.fSign = pr80Src->s.fSign;
4149 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4150 pr32Dst->s.uFraction = 0;
4151 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4152 }
4153 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4154 {
4155 /* Mapped to +/-QNaN */
4156 pr32Dst->s.fSign = pr80Src->s.fSign;
4157 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4158 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4159 }
4160 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4161 {
4162 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4163 if (fFcw & X86_FCW_IM)
4164 {
4165 pr32Dst->s.fSign = 1;
4166 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4167 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4168 fFsw |= X86_FSW_IE;
4169 }
4170 else
4171 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4172 }
4173 else if (RTFLOAT80U_IS_NAN(pr80Src))
4174 {
4175 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4176 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4177 {
4178 pr32Dst->s.fSign = pr80Src->s.fSign;
4179 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4180 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4181 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4182 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4183 fFsw |= X86_FSW_IE;
4184 }
4185 else
4186 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4187 }
4188 else
4189 {
4190 /* Denormal values causes both an underflow and precision exception. */
4191 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4192 if (fFcw & X86_FCW_UM)
4193 {
4194 pr32Dst->s.fSign = pr80Src->s.fSign;
4195 pr32Dst->s.uExponent = 0;
4196 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4197 {
4198 pr32Dst->s.uFraction = 1;
4199 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4200 if (!(fFcw & X86_FCW_PM))
4201 fFsw |= X86_FSW_ES | X86_FSW_B;
4202 }
4203 else
4204 {
4205 pr32Dst->s.uFraction = 0;
4206 fFsw |= X86_FSW_UE | X86_FSW_PE;
4207 if (!(fFcw & X86_FCW_PM))
4208 fFsw |= X86_FSW_ES | X86_FSW_B;
4209 }
4210 }
4211 else
4212 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4213 }
4214 *pu16FSW = fFsw;
4215}
4216
4217
4218/**
4219 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4220 *
4221 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4222 *
4223 * @returns Updated FPU status word value.
4224 * @param fSignIn Incoming sign indicator.
4225 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4226 * @param iExponentIn Unbiased exponent.
4227 * @param fFcw The FPU control word.
4228 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4229 * @param pr64Dst Where to return the output value, if one should be
4230 * returned.
4231 *
4232 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4233 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4234 */
4235static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4236 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4237{
4238 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4239 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4240 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4241 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4242 ? fRoundingOffMask
4243 : 0;
4244 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4245
4246 /*
4247 * Deal with potential overflows/underflows first, optimizing for none.
4248 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4249 */
4250 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4251 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4252 { /* likely? */ }
4253 /*
4254 * Underflow if the exponent zero or negative. This is attempted mapped
4255 * to a subnormal number when possible, with some additional trickery ofc.
4256 */
4257 else if (iExponentOut <= 0)
4258 {
4259 bool const fIsTiny = iExponentOut < 0
4260 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4261 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4262 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4263 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4264
4265 if (iExponentOut <= 0)
4266 {
4267 uMantissaIn = iExponentOut <= -63
4268 ? uMantissaIn != 0
4269 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4270 fRoundedOff = uMantissaIn & fRoundingOffMask;
4271 if (fRoundedOff && fIsTiny)
4272 fFsw |= X86_FSW_UE;
4273 iExponentOut = 0;
4274 }
4275 }
4276 /*
4277 * Overflow if at or above max exponent value or if we will reach max
4278 * when rounding. Will return +/-zero or +/-max value depending on
4279 * whether we're rounding or not.
4280 */
4281 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4282 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4283 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4284 {
4285 fFsw |= X86_FSW_OE;
4286 if (!(fFcw & X86_FCW_OM))
4287 return fFsw | X86_FSW_ES | X86_FSW_B;
4288 fFsw |= X86_FSW_PE;
4289 if (uRoundingAdd)
4290 fFsw |= X86_FSW_C1;
4291 if (!(fFcw & X86_FCW_PM))
4292 fFsw |= X86_FSW_ES | X86_FSW_B;
4293
4294 pr64Dst->s64.fSign = fSignIn;
4295 if (uRoundingAdd)
4296 { /* Zero */
4297 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4298 pr64Dst->s64.uFraction = 0;
4299 }
4300 else
4301 { /* Max */
4302 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4303 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4304 }
4305 return fFsw;
4306 }
4307
4308 /*
4309 * Normal or subnormal number.
4310 */
4311 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4312 uint64_t uMantissaOut = uMantissaIn;
4313 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4314 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4315 || fRoundedOff != uRoundingAdd)
4316 {
4317 uMantissaOut = uMantissaIn + uRoundingAdd;
4318 if (uMantissaOut >= uMantissaIn)
4319 { /* likely */ }
4320 else
4321 {
4322 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4323 iExponentOut++;
4324 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4325 fFsw |= X86_FSW_C1;
4326 }
4327 }
4328 else
4329 uMantissaOut = uMantissaIn;
4330
4331 /* Truncate the mantissa and set the return value. */
4332 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4333
4334 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4335 pr64Dst->s64.uExponent = iExponentOut;
4336 pr64Dst->s64.fSign = fSignIn;
4337
4338 /* Set status flags realted to rounding. */
4339 if (fRoundedOff)
4340 {
4341 fFsw |= X86_FSW_PE;
4342 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4343 fFsw |= X86_FSW_C1;
4344 if (!(fFcw & X86_FCW_PM))
4345 fFsw |= X86_FSW_ES | X86_FSW_B;
4346 }
4347
4348 return fFsw;
4349}
4350
4351
4352/**
4353 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4354 */
4355IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4356 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4357{
4358 uint16_t const fFcw = pFpuState->FCW;
4359 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4360 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4361 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4362 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4363 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4364 {
4365 pr64Dst->s64.fSign = pr80Src->s.fSign;
4366 pr64Dst->s64.uExponent = 0;
4367 pr64Dst->s64.uFraction = 0;
4368 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4369 }
4370 else if (RTFLOAT80U_IS_INF(pr80Src))
4371 {
4372 pr64Dst->s64.fSign = pr80Src->s.fSign;
4373 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4374 pr64Dst->s64.uFraction = 0;
4375 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4376 }
4377 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4378 {
4379 /* Mapped to +/-QNaN */
4380 pr64Dst->s64.fSign = pr80Src->s.fSign;
4381 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4382 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4383 }
4384 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4385 {
4386 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4387 if (fFcw & X86_FCW_IM)
4388 {
4389 pr64Dst->s64.fSign = 1;
4390 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4391 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4392 fFsw |= X86_FSW_IE;
4393 }
4394 else
4395 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4396 }
4397 else if (RTFLOAT80U_IS_NAN(pr80Src))
4398 {
4399 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4400 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4401 {
4402 pr64Dst->s64.fSign = pr80Src->s.fSign;
4403 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4404 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4405 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4406 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4407 fFsw |= X86_FSW_IE;
4408 }
4409 else
4410 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4411 }
4412 else
4413 {
4414 /* Denormal values causes both an underflow and precision exception. */
4415 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4416 if (fFcw & X86_FCW_UM)
4417 {
4418 pr64Dst->s64.fSign = pr80Src->s.fSign;
4419 pr64Dst->s64.uExponent = 0;
4420 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4421 {
4422 pr64Dst->s64.uFraction = 1;
4423 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4424 if (!(fFcw & X86_FCW_PM))
4425 fFsw |= X86_FSW_ES | X86_FSW_B;
4426 }
4427 else
4428 {
4429 pr64Dst->s64.uFraction = 0;
4430 fFsw |= X86_FSW_UE | X86_FSW_PE;
4431 if (!(fFcw & X86_FCW_PM))
4432 fFsw |= X86_FSW_ES | X86_FSW_B;
4433 }
4434 }
4435 else
4436 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4437 }
4438 *pu16FSW = fFsw;
4439}
4440
4441
4442IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4443 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4444{
4445 /*
4446 * FPU status word:
4447 * - TOP is irrelevant, but we must match x86 assembly version (0).
4448 * - C1 is always cleared as we don't have any stack overflows.
4449 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4450 */
4451 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4452 *pr80Dst = *pr80Src;
4453}
4454
4455
4456/*
4457 *
4458 * Mantissa:
4459 * 63 56 48 40 32 24 16 8 0
4460 * v v v v v v v v v
4461 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4462 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4463 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4464 *
4465 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4466 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4467 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4468 * where we'll drop off all but bit 63.
4469 */
4470#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4471IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4472 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4473{ \
4474 uint16_t const fFcw = pFpuState->FCW; \
4475 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4476 bool const fSignIn = pr80Val->s.fSign; \
4477 \
4478 /* \
4479 * Deal with normal numbers first. \
4480 */ \
4481 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4482 { \
4483 uint64_t uMantissa = pr80Val->s.uMantissa; \
4484 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4485 \
4486 if ((uint32_t)iExponent <= a_cBits - 2) \
4487 { \
4488 unsigned const cShiftOff = 63 - iExponent; \
4489 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4490 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4491 ? RT_BIT_64(cShiftOff - 1) \
4492 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4493 ? fRoundingOffMask \
4494 : 0; \
4495 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4496 \
4497 uMantissa >>= cShiftOff; \
4498 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4499 uMantissa += uRounding; \
4500 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4501 { \
4502 if (fRoundedOff) \
4503 { \
4504 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4505 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4506 else if (uRounding) \
4507 fFsw |= X86_FSW_C1; \
4508 fFsw |= X86_FSW_PE; \
4509 if (!(fFcw & X86_FCW_PM)) \
4510 fFsw |= X86_FSW_ES | X86_FSW_B; \
4511 } \
4512 \
4513 if (!fSignIn) \
4514 *piDst = (a_iType)uMantissa; \
4515 else \
4516 *piDst = -(a_iType)uMantissa; \
4517 } \
4518 else \
4519 { \
4520 /* overflowed after rounding. */ \
4521 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4522 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4523 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4524 \
4525 /* Special case for the integer minimum value. */ \
4526 if (fSignIn) \
4527 { \
4528 *piDst = a_iTypeMin; \
4529 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4530 if (!(fFcw & X86_FCW_PM)) \
4531 fFsw |= X86_FSW_ES | X86_FSW_B; \
4532 } \
4533 else \
4534 { \
4535 fFsw |= X86_FSW_IE; \
4536 if (fFcw & X86_FCW_IM) \
4537 *piDst = a_iTypeMin; \
4538 else \
4539 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4540 } \
4541 } \
4542 } \
4543 /* \
4544 * Tiny sub-zero numbers. \
4545 */ \
4546 else if (iExponent < 0) \
4547 { \
4548 if (!fSignIn) \
4549 { \
4550 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4551 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4552 { \
4553 *piDst = 1; \
4554 fFsw |= X86_FSW_C1; \
4555 } \
4556 else \
4557 *piDst = 0; \
4558 } \
4559 else \
4560 { \
4561 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4562 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4563 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4564 *piDst = 0; \
4565 else \
4566 { \
4567 *piDst = -1; \
4568 fFsw |= X86_FSW_C1; \
4569 } \
4570 } \
4571 fFsw |= X86_FSW_PE; \
4572 if (!(fFcw & X86_FCW_PM)) \
4573 fFsw |= X86_FSW_ES | X86_FSW_B; \
4574 } \
4575 /* \
4576 * Special MIN case. \
4577 */ \
4578 else if ( fSignIn && iExponent == a_cBits - 1 \
4579 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4580 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4581 : uMantissa == RT_BIT_64(63))) \
4582 { \
4583 *piDst = a_iTypeMin; \
4584 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4585 { \
4586 fFsw |= X86_FSW_PE; \
4587 if (!(fFcw & X86_FCW_PM)) \
4588 fFsw |= X86_FSW_ES | X86_FSW_B; \
4589 } \
4590 } \
4591 /* \
4592 * Too large/small number outside the target integer range. \
4593 */ \
4594 else \
4595 { \
4596 fFsw |= X86_FSW_IE; \
4597 if (fFcw & X86_FCW_IM) \
4598 *piDst = a_iTypeIndefinite; \
4599 else \
4600 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4601 } \
4602 } \
4603 /* \
4604 * Map both +0 and -0 to integer zero (signless/+). \
4605 */ \
4606 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4607 *piDst = 0; \
4608 /* \
4609 * Denormals are just really tiny sub-zero numbers that are either rounded \
4610 * to zero, 1 or -1 depending on sign and rounding control. \
4611 */ \
4612 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4613 { \
4614 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4615 *piDst = 0; \
4616 else \
4617 { \
4618 *piDst = fSignIn ? -1 : 1; \
4619 fFsw |= X86_FSW_C1; \
4620 } \
4621 fFsw |= X86_FSW_PE; \
4622 if (!(fFcw & X86_FCW_PM)) \
4623 fFsw |= X86_FSW_ES | X86_FSW_B; \
4624 } \
4625 /* \
4626 * All other special values are considered invalid arguments and result \
4627 * in an IE exception and indefinite value if masked. \
4628 */ \
4629 else \
4630 { \
4631 fFsw |= X86_FSW_IE; \
4632 if (fFcw & X86_FCW_IM) \
4633 *piDst = a_iTypeIndefinite; \
4634 else \
4635 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4636 } \
4637 *pu16FSW = fFsw; \
4638}
4639EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4640EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4641EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4642
4643#endif /*IEM_WITHOUT_ASSEMBLY */
4644
4645
4646/*
4647 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4648 *
4649 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4650 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4651 * thus the @a a_cBitsIn.
4652 */
4653#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4654IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4655 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4656{ \
4657 uint16_t const fFcw = pFpuState->FCW; \
4658 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4659 bool const fSignIn = pr80Val->s.fSign; \
4660 \
4661 /* \
4662 * Deal with normal numbers first. \
4663 */ \
4664 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4665 { \
4666 uint64_t uMantissa = pr80Val->s.uMantissa; \
4667 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4668 \
4669 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4670 { \
4671 unsigned const cShiftOff = 63 - iExponent; \
4672 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4673 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4674 uMantissa >>= cShiftOff; \
4675 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4676 if (!fSignIn) \
4677 *piDst = (a_iType)uMantissa; \
4678 else \
4679 *piDst = -(a_iType)uMantissa; \
4680 \
4681 if (fRoundedOff) \
4682 { \
4683 fFsw |= X86_FSW_PE; \
4684 if (!(fFcw & X86_FCW_PM)) \
4685 fFsw |= X86_FSW_ES | X86_FSW_B; \
4686 } \
4687 } \
4688 /* \
4689 * Tiny sub-zero numbers. \
4690 */ \
4691 else if (iExponent < 0) \
4692 { \
4693 *piDst = 0; \
4694 fFsw |= X86_FSW_PE; \
4695 if (!(fFcw & X86_FCW_PM)) \
4696 fFsw |= X86_FSW_ES | X86_FSW_B; \
4697 } \
4698 /* \
4699 * Special MIN case. \
4700 */ \
4701 else if ( fSignIn && iExponent == a_cBits - 1 \
4702 && (a_cBits < 64 \
4703 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4704 : uMantissa == RT_BIT_64(63)) ) \
4705 { \
4706 *piDst = a_iTypeMin; \
4707 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4708 { \
4709 fFsw |= X86_FSW_PE; \
4710 if (!(fFcw & X86_FCW_PM)) \
4711 fFsw |= X86_FSW_ES | X86_FSW_B; \
4712 } \
4713 } \
4714 /* \
4715 * Figure this weirdness. \
4716 */ \
4717 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4718 { \
4719 *piDst = 0; \
4720 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4721 { \
4722 fFsw |= X86_FSW_PE; \
4723 if (!(fFcw & X86_FCW_PM)) \
4724 fFsw |= X86_FSW_ES | X86_FSW_B; \
4725 } \
4726 } \
4727 /* \
4728 * Too large/small number outside the target integer range. \
4729 */ \
4730 else \
4731 { \
4732 fFsw |= X86_FSW_IE; \
4733 if (fFcw & X86_FCW_IM) \
4734 *piDst = a_iTypeIndefinite; \
4735 else \
4736 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4737 } \
4738 } \
4739 /* \
4740 * Map both +0 and -0 to integer zero (signless/+). \
4741 */ \
4742 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4743 *piDst = 0; \
4744 /* \
4745 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4746 */ \
4747 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4748 { \
4749 *piDst = 0; \
4750 fFsw |= X86_FSW_PE; \
4751 if (!(fFcw & X86_FCW_PM)) \
4752 fFsw |= X86_FSW_ES | X86_FSW_B; \
4753 } \
4754 /* \
4755 * All other special values are considered invalid arguments and result \
4756 * in an IE exception and indefinite value if masked. \
4757 */ \
4758 else \
4759 { \
4760 fFsw |= X86_FSW_IE; \
4761 if (fFcw & X86_FCW_IM) \
4762 *piDst = a_iTypeIndefinite; \
4763 else \
4764 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4765 } \
4766 *pu16FSW = fFsw; \
4767}
4768#if defined(IEM_WITHOUT_ASSEMBLY)
4769EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4770EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4771EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4772#endif
4773EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4774EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4775
4776
4777#if defined(IEM_WITHOUT_ASSEMBLY)
4778
4779IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4780 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4781{
4782 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4783 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4784 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4785 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4786 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4787
4788 uint16_t const fFcw = pFpuState->FCW;
4789 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4790 bool const fSignIn = pr80Src->s.fSign;
4791
4792 /*
4793 * Deal with normal numbers first.
4794 */
4795 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4796 {
4797 uint64_t uMantissa = pr80Src->s.uMantissa;
4798 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4799 if ( (uint32_t)iExponent <= 58
4800 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4801 {
4802 unsigned const cShiftOff = 63 - iExponent;
4803 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4804 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4805 ? RT_BIT_64(cShiftOff - 1)
4806 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4807 ? fRoundingOffMask
4808 : 0;
4809 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4810
4811 uMantissa >>= cShiftOff;
4812 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4813 uMantissa += uRounding;
4814 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4815 {
4816 if (fRoundedOff)
4817 {
4818 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4819 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4820 else if (uRounding)
4821 fFsw |= X86_FSW_C1;
4822 fFsw |= X86_FSW_PE;
4823 if (!(fFcw & X86_FCW_PM))
4824 fFsw |= X86_FSW_ES | X86_FSW_B;
4825 }
4826
4827 pd80Dst->s.fSign = fSignIn;
4828 pd80Dst->s.uPad = 0;
4829 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4830 {
4831 unsigned const uDigits = uMantissa % 100;
4832 uMantissa /= 100;
4833 uint8_t const bLo = uDigits % 10;
4834 uint8_t const bHi = uDigits / 10;
4835 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4836 }
4837 }
4838 else
4839 {
4840 /* overflowed after rounding. */
4841 fFsw |= X86_FSW_IE;
4842 if (fFcw & X86_FCW_IM)
4843 *pd80Dst = s_d80Indefinite;
4844 else
4845 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4846 }
4847 }
4848 /*
4849 * Tiny sub-zero numbers.
4850 */
4851 else if (iExponent < 0)
4852 {
4853 if (!fSignIn)
4854 {
4855 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4856 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4857 {
4858 *pd80Dst = s_ad80One[fSignIn];
4859 fFsw |= X86_FSW_C1;
4860 }
4861 else
4862 *pd80Dst = s_ad80Zeros[fSignIn];
4863 }
4864 else
4865 {
4866 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4867 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4868 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4869 *pd80Dst = s_ad80Zeros[fSignIn];
4870 else
4871 {
4872 *pd80Dst = s_ad80One[fSignIn];
4873 fFsw |= X86_FSW_C1;
4874 }
4875 }
4876 fFsw |= X86_FSW_PE;
4877 if (!(fFcw & X86_FCW_PM))
4878 fFsw |= X86_FSW_ES | X86_FSW_B;
4879 }
4880 /*
4881 * Too large/small number outside the target integer range.
4882 */
4883 else
4884 {
4885 fFsw |= X86_FSW_IE;
4886 if (fFcw & X86_FCW_IM)
4887 *pd80Dst = s_d80Indefinite;
4888 else
4889 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4890 }
4891 }
4892 /*
4893 * Map both +0 and -0 to integer zero (signless/+).
4894 */
4895 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4896 *pd80Dst = s_ad80Zeros[fSignIn];
4897 /*
4898 * Denormals are just really tiny sub-zero numbers that are either rounded
4899 * to zero, 1 or -1 depending on sign and rounding control.
4900 */
4901 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4902 {
4903 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4904 *pd80Dst = s_ad80Zeros[fSignIn];
4905 else
4906 {
4907 *pd80Dst = s_ad80One[fSignIn];
4908 fFsw |= X86_FSW_C1;
4909 }
4910 fFsw |= X86_FSW_PE;
4911 if (!(fFcw & X86_FCW_PM))
4912 fFsw |= X86_FSW_ES | X86_FSW_B;
4913 }
4914 /*
4915 * All other special values are considered invalid arguments and result
4916 * in an IE exception and indefinite value if masked.
4917 */
4918 else
4919 {
4920 fFsw |= X86_FSW_IE;
4921 if (fFcw & X86_FCW_IM)
4922 *pd80Dst = s_d80Indefinite;
4923 else
4924 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4925 }
4926 *pu16FSW = fFsw;
4927}
4928
4929
4930/*********************************************************************************************************************************
4931* FPU Helpers *
4932*********************************************************************************************************************************/
4933AssertCompileSize(RTFLOAT128U, 16);
4934AssertCompileSize(RTFLOAT80U, 10);
4935AssertCompileSize(RTFLOAT64U, 8);
4936AssertCompileSize(RTFLOAT32U, 4);
4937
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro declares a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized.  If @a a_pr80Val points to a pseudo-denormal, the
 * value is copied into that variable with uExponent set to 1 and @a a_pr80Val
 * is redirected to the copy; otherwise @a a_pr80Val is left alone and the
 * declared variable stays uninitialized.
 *
 * Since it expands to a declaration followed by a statement, it can only be
 * used where declarations are allowed, and @a a_pr80Val must be assignable.
 *
 * @note This must be applied before calling SoftFloat with a value that could
 *       be a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *       correctly.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized             = *(a_pr80Val); \
        a_r80ValNormalized.s.uExponent = 1; \
        (a_pr80Val)                    = &a_r80ValNormalized; \
    } else do {} while (0)
4961
4962#ifdef IEM_WITH_FLOAT128_FOR_FPU
4963
4964DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4965{
4966 int fNew;
4967 switch (fFcw & X86_FCW_RC_MASK)
4968 {
4969 default:
4970 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4971 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4972 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4973 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4974 }
4975 int fOld = fegetround();
4976 fesetround(fNew);
4977 return fOld;
4978}
4979
4980
4981DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4982{
4983 fesetround(fOld);
4984}
4985
4986DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4987{
4988 RT_NOREF(fFcw);
4989 RTFLOAT128U Tmp;
4990 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4991 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4992 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4993 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4994 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4995 {
4996 Assert(Tmp.s.uExponent == 0);
4997 Tmp.s2.uSignAndExponent++;
4998 }
4999 return *(_Float128 *)&Tmp;
5000}
5001
5002
5003DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5004{
5005 RT_NOREF(fFcw);
5006 RTFLOAT128U Tmp;
5007 *(_Float128 *)&Tmp = rd128ValSrc;
5008 ASMCompilerBarrier();
5009 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5010 {
5011 pr80Dst->s.fSign = Tmp.s64.fSign;
5012 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5013 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5014 | Tmp.s64.uFractionLo >> (64 - 15);
5015
5016 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5017 unsigned const cShiftOff = 64 - 15;
5018 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5019 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5020 if (uRoundedOff)
5021 {
5022 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5023 ? RT_BIT_64(cShiftOff - 1)
5024 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5025 ? fRoundingOffMask
5026 : 0;
5027 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5028 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5029 || uRoundedOff != uRoundingAdd)
5030 {
5031 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5032 {
5033 uFraction += 1;
5034 if (!(uFraction & RT_BIT_64(63)))
5035 { /* likely */ }
5036 else
5037 {
5038 uFraction >>= 1;
5039 pr80Dst->s.uExponent++;
5040 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5041 return fFsw;
5042 }
5043 fFsw |= X86_FSW_C1;
5044 }
5045 }
5046 fFsw |= X86_FSW_PE;
5047 if (!(fFcw & X86_FCW_PM))
5048 fFsw |= X86_FSW_ES | X86_FSW_B;
5049 }
5050 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5051 }
5052 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5053 {
5054 pr80Dst->s.fSign = Tmp.s64.fSign;
5055 pr80Dst->s.uExponent = 0;
5056 pr80Dst->s.uMantissa = 0;
5057 }
5058 else if (RTFLOAT128U_IS_INF(&Tmp))
5059 {
5060 pr80Dst->s.fSign = Tmp.s64.fSign;
5061 pr80Dst->s.uExponent = 0;
5062 pr80Dst->s.uMantissa = 0;
5063 }
5064 return fFsw;
5065}
5066
5067
5068#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5069
/**
 * Initializer for the SoftFloat state structure, derived from an FPU control
 * word.
 *
 * The five values are, in order: tininess detection after rounding, the
 * SoftFloat rounding mode matching FCW.RC, zero, the exception mask bits of
 * FCW, and 32/64/80 depending on FCW.PC.
 * NOTE(review): the last value presumably is the rounding precision and the
 * zero the initial exception flags - confirm against softfloat_state_t.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
        (uint8_t)(  ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? softfloat_round_max \
                  : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? softfloat_round_min \
                  : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? softfloat_round_near_even \
                  :                                                      softfloat_round_minMag), \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
        (uint8_t)(  ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? 32 \
                  : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? 64 \
                  :                                                 80) \
    }
5083
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The exception bits in the SoftFloat exceptionFlags are ORed into @a a_fFsw
 * together with softfloat_flag_c1 shifted left by two.  ES and B are added
 * when any of the raised exceptions is not masked by @a a_fFcw.
 *
 * @note    @a a_pSoftState is evaluated more than once.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (   (a_fFsw) \
      | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
      | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
      | (  ((a_pSoftState)->exceptionFlags & ~(a_fFcw) & X86_FSW_XCPT_MASK) != 0 \
         ? X86_FSW_ES | X86_FSW_B : 0) )
5091
5092
5093DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5094{
5095 RT_NOREF(fFcw);
5096 Assert(cBits > 64);
5097# if 0 /* rounding does not seem to help */
5098 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5099 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5100 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5101 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5102 {
5103 uint64_t uOld = r128.v[0];
5104 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5105 if (r128.v[0] < uOld)
5106 r128.v[1] += 1;
5107 }
5108# else
5109 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5110# endif
5111 return r128;
5112}
5113
5114
5115DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5116{
5117 RT_NOREF(fFcw);
5118 Assert(cBits > 64);
5119# if 0 /* rounding does not seem to help, not even on constants */
5120 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5121 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5122 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5123 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5124 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5125 {
5126 uint64_t uOld = r128.v[0];
5127 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5128 if (r128.v[0] < uOld)
5129 r128.v[1] += 1;
5130 }
5131 return r128;
5132# else
5133 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5134 return r128;
5135# endif
5136}
5137
5138
# if 0 /* unused */
/**
 * Converts from the IPRT 128-bit floating point format to the SoftFloat one.
 *
 * This only copies the two 64-bit words, nothing else.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128Ret;
    r128Ret.v[0] = pr128->au64[0];
    r128Ret.v[1] = pr128->au64[1];
    return r128Ret;
}
# endif
5146
5147
5148/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5149DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5150{
5151 extFloat80_t Tmp;
5152 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5153 Tmp.signif = pr80Val->s2.uMantissa;
5154 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5155 return extF80_to_f128(Tmp, &Ignored);
5156}
5157
5158
5159/**
5160 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5161 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5162 *
5163 * This is only a structure format conversion, nothing else.
5164 */
5165DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5166{
5167 extFloat80_t Tmp;
5168 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5169 Tmp.signif = pr80Val->s2.uMantissa;
5170 return Tmp;
5171}
5172
5173
5174/**
5175 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5176 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5177 *
5178 * This is only a structure format conversion, nothing else.
5179 */
5180DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5181{
5182 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5183 pr80Dst->s2.uMantissa = r80XSrc.signif;
5184 return pr80Dst;
5185}
5186
5187
5188DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5189{
5190 RT_NOREF(fFcw);
5191 RTFLOAT128U Tmp;
5192 *(float128_t *)&Tmp = r128Src;
5193 ASMCompilerBarrier();
5194
5195 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5196 {
5197 pr80Dst->s.fSign = Tmp.s64.fSign;
5198 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5199 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5200 | Tmp.s64.uFractionLo >> (64 - 15);
5201
5202 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5203 unsigned const cShiftOff = 64 - 15;
5204 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5205 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5206 if (uRoundedOff)
5207 {
5208 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5209 ? RT_BIT_64(cShiftOff - 1)
5210 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5211 ? fRoundingOffMask
5212 : 0;
5213 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5214 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5215 || uRoundedOff != uRoundingAdd)
5216 {
5217 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5218 {
5219 uFraction += 1;
5220 if (!(uFraction & RT_BIT_64(63)))
5221 { /* likely */ }
5222 else
5223 {
5224 uFraction >>= 1;
5225 pr80Dst->s.uExponent++;
5226 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5227 return fFsw;
5228 }
5229 fFsw |= X86_FSW_C1;
5230 }
5231 }
5232 fFsw |= X86_FSW_PE;
5233 if (!(fFcw & X86_FCW_PM))
5234 fFsw |= X86_FSW_ES | X86_FSW_B;
5235 }
5236
5237 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5238 }
5239 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5240 {
5241 pr80Dst->s.fSign = Tmp.s64.fSign;
5242 pr80Dst->s.uExponent = 0;
5243 pr80Dst->s.uMantissa = 0;
5244 }
5245 else if (RTFLOAT128U_IS_INF(&Tmp))
5246 {
5247 pr80Dst->s.fSign = Tmp.s64.fSign;
5248 pr80Dst->s.uExponent = 0x7fff;
5249 pr80Dst->s.uMantissa = 0;
5250 }
5251 return fFsw;
5252}
5253
5254
5255/**
5256 * Helper for transfering exception and C1 to FSW and setting the result value
5257 * accordingly.
5258 *
5259 * @returns Updated FSW.
5260 * @param pSoftState The SoftFloat state following the operation.
5261 * @param r80XResult The result of the SoftFloat operation.
5262 * @param pr80Result Where to store the result for IEM.
5263 * @param fFcw The FPU control word.
5264 * @param fFsw The FSW before the operation, with necessary bits
5265 * cleared and such.
5266 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5267 * raised.
5268 */
5269DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5270 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5271 PCRTFLOAT80U pr80XcptResult)
5272{
5273 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5274 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5275 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5276 fFsw |= X86_FSW_ES | X86_FSW_B;
5277
5278 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5279 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5280 else
5281 {
5282 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5283 *pr80Result = *pr80XcptResult;
5284 }
5285 return fFsw;
5286}
5287
5288
5289/**
5290 * Helper doing polynomial evaluation using Horner's method.
5291 *
5292 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5293 */
5294float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5295 unsigned cPrecision, softfloat_state_t *pSoftState)
5296{
5297 Assert(cHornerConsts > 1);
5298 size_t i = cHornerConsts - 1;
5299 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5300 while (i-- > 0)
5301 {
5302 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5303 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5304 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5305 }
5306 return r128Result;
5307}
5308
5309#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5310
5311
5312/**
5313 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5314 * mantissa, exponent and sign.
5315 *
5316 * @returns Updated FSW.
5317 * @param pr80Dst Where to return the composed value.
5318 * @param fSign The sign.
5319 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5320 * ignored and should be zero. This will probably be
5321 * modified during normalization and rounding.
5322 * @param iExponent Unbiased exponent.
5323 * @param fFcw The FPU control word.
5324 * @param fFsw The FPU status word.
5325 */
5326static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5327 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5328{
5329 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5330
5331 iExponent += RTFLOAT80U_EXP_BIAS;
5332
5333 /* Do normalization if necessary and possible. */
5334 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5335 {
5336 int cShift = 192 - RTUInt256BitCount(puMantissa);
5337 if (iExponent > cShift)
5338 iExponent -= cShift;
5339 else
5340 {
5341 if (fFcw & X86_FCW_UM)
5342 {
5343 if (iExponent > 0)
5344 cShift = --iExponent;
5345 else
5346 cShift = 0;
5347 }
5348 iExponent -= cShift;
5349 }
5350 RTUInt256AssignShiftLeft(puMantissa, cShift);
5351 }
5352
5353 /* Do rounding. */
5354 uint64_t uMantissa = puMantissa->QWords.qw2;
5355 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5356 {
5357 bool fAdd;
5358 switch (fFcw & X86_FCW_RC_MASK)
5359 {
5360 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5361 case X86_FCW_RC_NEAREST:
5362 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5363 {
5364 if ( (uMantissa & 1)
5365 || puMantissa->QWords.qw0 != 0
5366 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5367 {
5368 fAdd = true;
5369 break;
5370 }
5371 uMantissa &= ~(uint64_t)1;
5372 }
5373 fAdd = false;
5374 break;
5375 case X86_FCW_RC_ZERO:
5376 fAdd = false;
5377 break;
5378 case X86_FCW_RC_UP:
5379 fAdd = !fSign;
5380 break;
5381 case X86_FCW_RC_DOWN:
5382 fAdd = fSign;
5383 break;
5384 }
5385 if (fAdd)
5386 {
5387 uint64_t const uTmp = uMantissa;
5388 uMantissa = uTmp + 1;
5389 if (uMantissa < uTmp)
5390 {
5391 uMantissa >>= 1;
5392 uMantissa |= RT_BIT_64(63);
5393 iExponent++;
5394 }
5395 fFsw |= X86_FSW_C1;
5396 }
5397 fFsw |= X86_FSW_PE;
5398 if (!(fFcw & X86_FCW_PM))
5399 fFsw |= X86_FSW_ES | X86_FSW_B;
5400 }
5401
5402 /* Check for underflow (denormals). */
5403 if (iExponent <= 0)
5404 {
5405 if (fFcw & X86_FCW_UM)
5406 {
5407 if (uMantissa & RT_BIT_64(63))
5408 uMantissa >>= 1;
5409 iExponent = 0;
5410 }
5411 else
5412 {
5413 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5414 fFsw |= X86_FSW_ES | X86_FSW_B;
5415 }
5416 fFsw |= X86_FSW_UE;
5417 }
5418 /* Check for overflow */
5419 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5420 {
5421 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5422 }
5423
5424 /* Compose the result. */
5425 pr80Dst->s.uMantissa = uMantissa;
5426 pr80Dst->s.uExponent = iExponent;
5427 pr80Dst->s.fSign = fSign;
5428 return fFsw;
5429}
5430
5431
5432/**
5433 * See also iemAImpl_fld_r80_from_r32
5434 */
5435static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5436{
5437 uint16_t fFsw = 0;
5438 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5439 {
5440 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5441 pr80Dst->sj64.fInteger = 1;
5442 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5443 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5444 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5445 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5446 }
5447 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5448 {
5449 pr80Dst->s.fSign = pr32Val->s.fSign;
5450 pr80Dst->s.uExponent = 0;
5451 pr80Dst->s.uMantissa = 0;
5452 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5453 }
5454 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5455 {
5456 /* Subnormal -> normalized + X86_FSW_DE return. */
5457 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5458 pr80Dst->sj64.fInteger = 1;
5459 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5460 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5461 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5462 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5463 fFsw = X86_FSW_DE;
5464 }
5465 else if (RTFLOAT32U_IS_INF(pr32Val))
5466 {
5467 pr80Dst->s.fSign = pr32Val->s.fSign;
5468 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5469 pr80Dst->s.uMantissa = RT_BIT_64(63);
5470 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5471 }
5472 else
5473 {
5474 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5475 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5476 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5477 pr80Dst->sj64.fInteger = 1;
5478 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5479 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5480 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5481 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5482 }
5483 return fFsw;
5484}
5485
5486
5487/**
5488 * See also iemAImpl_fld_r80_from_r64
5489 */
5490static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5491{
5492 uint16_t fFsw = 0;
5493 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5494 {
5495 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5496 pr80Dst->sj64.fInteger = 1;
5497 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5498 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5499 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5500 }
5501 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5502 {
5503 pr80Dst->s.fSign = pr64Val->s.fSign;
5504 pr80Dst->s.uExponent = 0;
5505 pr80Dst->s.uMantissa = 0;
5506 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5507 }
5508 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5509 {
5510 /* Subnormal values gets normalized. */
5511 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5512 pr80Dst->sj64.fInteger = 1;
5513 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5514 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5515 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5516 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5517 fFsw = X86_FSW_DE;
5518 }
5519 else if (RTFLOAT64U_IS_INF(pr64Val))
5520 {
5521 pr80Dst->s.fSign = pr64Val->s.fSign;
5522 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5523 pr80Dst->s.uMantissa = RT_BIT_64(63);
5524 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5525 }
5526 else
5527 {
5528 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5529 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5530 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5531 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5532 pr80Dst->sj64.fInteger = 1;
5533 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5534 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5535 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5536 }
5537 return fFsw;
5538}
5539
5540
5541/**
5542 * See also EMIT_FILD.
5543 */
5544#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5545static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5546{ \
5547 if (iVal == 0) \
5548 { \
5549 pr80Dst->s.fSign = 0; \
5550 pr80Dst->s.uExponent = 0; \
5551 pr80Dst->s.uMantissa = 0; \
5552 } \
5553 else \
5554 { \
5555 if (iVal > 0) \
5556 pr80Dst->s.fSign = 0; \
5557 else \
5558 { \
5559 pr80Dst->s.fSign = 1; \
5560 iVal = -iVal; \
5561 } \
5562 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5563 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5564 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5565 } \
5566 return pr80Dst; \
5567}
5568EMIT_CONVERT_IXX_TO_R80(16)
5569EMIT_CONVERT_IXX_TO_R80(32)
5570//EMIT_CONVERT_IXX_TO_R80(64)
5571
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The 64-bit operand is widened to 80-bit and the work is handed to
 * @a a_fnR80ByR80, after which the TOP field of the resulting FSW is set to 7.
 * If the 64-bit operand was a subnormal, \#DE is either suppressed (first
 * operand is a 387-invalid value or a NaN, or @a a_DenormalException is
 * true), raised immediately without calling the worker (X86_FCW_DM clear), or
 * merged into the worker's FSW.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit worker to call.
 * @param   a_DenormalException     Expression that, when true, suppresses the
 *                                  \#DE for a subnormal second operand.  It may
 *                                  refer to the emitted function's parameters.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFswDenormal = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(fFswDenormal == 0 || fFswDenormal == X86_FSW_DE); \
    if (fFswDenormal != 0) \
    { \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFswDenormal = 0; /* the denormal operand is not reported in these cases */ \
        else if ((pFpuState->FCW & X86_FCW_DM) == 0) \
        { \
            /* Unmasked #DE: return the first operand unchanged and skip the operation. */ \
            pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                         | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    uint16_t const fFswWorker = pFpuRes->FSW; \
    pFpuRes->FSW = (fFswWorker & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFswDenormal; \
}
5594
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The 32-bit operand is widened to 80-bit and the work is handed to
 * @a a_fnR80ByR80, after which the TOP field of the resulting FSW is set to 7.
 * If the 32-bit operand was a subnormal, \#DE is either suppressed (first
 * operand is a 387-invalid value or a NaN, or @a a_DenormalException is
 * true), raised immediately without calling the worker (X86_FCW_DM clear), or
 * merged into the worker's FSW.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit worker to call.
 * @param   a_DenormalException     Expression that, when true, suppresses the
 *                                  \#DE for a subnormal second operand.  It may
 *                                  refer to the emitted function's parameters.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFswDenormal = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(fFswDenormal == 0 || fFswDenormal == X86_FSW_DE); \
    if (fFswDenormal != 0) \
    { \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFswDenormal = 0; /* the denormal operand is not reported in these cases */ \
        else if ((pFpuState->FCW & X86_FCW_DM) == 0) \
        { \
            /* Unmasked #DE: return the first operand unchanged and skip the operation. */ \
            pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                         | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    uint16_t const fFswWorker = pFpuRes->FSW; \
    pFpuRes->FSW = (fFswWorker & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFswDenormal; \
}
5617
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * Converts the 32-bit integer operand to 80-bit floating point, calls
 * @a a_fnR80ByR80 and then sets the TOP field of the resulting FSW to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit worker to call.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    uint16_t const     fFswWorker = pFpuRes->FSW; \
    pFpuRes->FSW = (fFswWorker & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5626
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * Converts the 16-bit integer operand to 80-bit floating point, calls
 * @a a_fnR80ByR80 and then sets the TOP field of the resulting FSW to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit worker to call.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    uint16_t const     fFswWorker = pFpuRes->FSW; \
    pFpuRes->FSW = (fFswWorker & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5635
5636
5637
5638/*********************************************************************************************************************************
5639* x87 FPU Division Operations *
5640*********************************************************************************************************************************/
5641
5642/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5643static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5644 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5645{
5646 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5647 {
5648 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5649 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5650 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5651 }
5652 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5653 { /* Div by zero. */
5654 if (fFcw & X86_FCW_ZM)
5655 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5656 else
5657 {
5658 *pr80Result = *pr80Val1Org;
5659 fFsw |= X86_FSW_ES | X86_FSW_B;
5660 }
5661 fFsw |= X86_FSW_ZE;
5662 }
5663 else
5664 { /* Invalid operand */
5665 if (fFcw & X86_FCW_IM)
5666 *pr80Result = g_r80Indefinite;
5667 else
5668 {
5669 *pr80Result = *pr80Val1Org;
5670 fFsw |= X86_FSW_ES | X86_FSW_B;
5671 }
5672 fFsw |= X86_FSW_IE;
5673 }
5674 return fFsw;
5675}
5676
5677
5678IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5679 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5680{
5681 uint16_t const fFcw = pFpuState->FCW;
5682 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5683
5684 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5685 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5686 {
5687 if (fFcw & X86_FCW_IM)
5688 pFpuRes->r80Result = g_r80Indefinite;
5689 else
5690 {
5691 pFpuRes->r80Result = *pr80Val1;
5692 fFsw |= X86_FSW_ES | X86_FSW_B;
5693 }
5694 fFsw |= X86_FSW_IE;
5695 }
5696 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5697 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5698 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5699 {
5700 if (fFcw & X86_FCW_DM)
5701 {
5702 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5703 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5704 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5705 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5706 }
5707 else
5708 {
5709 pFpuRes->r80Result = *pr80Val1;
5710 fFsw |= X86_FSW_ES | X86_FSW_B;
5711 }
5712 fFsw |= X86_FSW_DE;
5713 }
5714 /* SoftFloat can handle the rest: */
5715 else
5716 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5717
5718 pFpuRes->FSW = fFsw;
5719}
5720
5721
5722EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5723EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5724EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5725EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5726
5727
5728IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5729 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5730{
5731 uint16_t const fFcw = pFpuState->FCW;
5732 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5733
5734 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5735 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5736 {
5737 if (fFcw & X86_FCW_IM)
5738 pFpuRes->r80Result = g_r80Indefinite;
5739 else
5740 {
5741 pFpuRes->r80Result = *pr80Val1;
5742 fFsw |= X86_FSW_ES | X86_FSW_B;
5743 }
5744 fFsw |= X86_FSW_IE;
5745 }
5746 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5747 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5748 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5749 {
5750 if (fFcw & X86_FCW_DM)
5751 {
5752 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5753 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5754 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5755 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5756 }
5757 else
5758 {
5759 pFpuRes->r80Result = *pr80Val1;
5760 fFsw |= X86_FSW_ES | X86_FSW_B;
5761 }
5762 fFsw |= X86_FSW_DE;
5763 }
5764 /* SoftFloat can handle the rest: */
5765 else
5766 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5767
5768 pFpuRes->FSW = fFsw;
5769}
5770
5771
5772EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5773EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5774EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5775EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5776
5777
5778/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5779static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5780 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5781{
5782 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5783 {
5784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5785 uint16_t fCxFlags = 0;
5786 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5787 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5788 &fCxFlags, &SoftState);
5789 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5790 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5791 if ( !(fFsw & X86_FSW_IE)
5792 && !RTFLOAT80U_IS_NAN(pr80Result)
5793 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5794 {
5795 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5796 fFsw |= fCxFlags & X86_FSW_C_MASK;
5797 }
5798 return fFsw;
5799 }
5800
5801 /* Invalid operand */
5802 if (fFcw & X86_FCW_IM)
5803 *pr80Result = g_r80Indefinite;
5804 else
5805 {
5806 *pr80Result = *pr80Val1Org;
5807 fFsw |= X86_FSW_ES | X86_FSW_B;
5808 }
5809 return fFsw | X86_FSW_IE;
5810}
5811
5812
5813static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5814 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5815{
5816 uint16_t const fFcw = pFpuState->FCW;
5817 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5818
5819 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5820 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5821 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5822 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5823 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5824 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5825 {
5826 if (fFcw & X86_FCW_IM)
5827 pFpuRes->r80Result = g_r80Indefinite;
5828 else
5829 {
5830 pFpuRes->r80Result = *pr80Val1;
5831 fFsw |= X86_FSW_ES | X86_FSW_B;
5832 }
5833 fFsw |= X86_FSW_IE;
5834 }
5835 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5836 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5837 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5838 {
5839 if (fFcw & X86_FCW_DM)
5840 {
5841 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5842 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5843 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5844 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5845 pr80Val1Org, fLegacyInstr);
5846 }
5847 else
5848 {
5849 pFpuRes->r80Result = *pr80Val1;
5850 fFsw |= X86_FSW_ES | X86_FSW_B;
5851 }
5852 fFsw |= X86_FSW_DE;
5853 }
5854 /* SoftFloat can handle the rest: */
5855 else
5856 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5857 pr80Val1, fLegacyInstr);
5858
5859 pFpuRes->FSW = fFsw;
5860}
5861
5862
5863IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5864 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5865{
5866 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5867}
5868
5869
5870IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5871 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5872{
5873 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5874}
5875
5876
5877/*********************************************************************************************************************************
5878* x87 FPU Multiplication Operations *
5879*********************************************************************************************************************************/
5880
5881/** Worker for iemAImpl_fmul_r80_by_r80. */
5882static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5883 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5884{
5885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5886 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5887 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5888}
5889
5890
5891IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5892 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5893{
5894 uint16_t const fFcw = pFpuState->FCW;
5895 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5896
5897 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5898 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5899 {
5900 if (fFcw & X86_FCW_IM)
5901 pFpuRes->r80Result = g_r80Indefinite;
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_IE;
5908 }
5909 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5910 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5911 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5912 {
5913 if (fFcw & X86_FCW_DM)
5914 {
5915 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5916 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5917 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5918 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5919 }
5920 else
5921 {
5922 pFpuRes->r80Result = *pr80Val1;
5923 fFsw |= X86_FSW_ES | X86_FSW_B;
5924 }
5925 fFsw |= X86_FSW_DE;
5926 }
5927 /* SoftFloat can handle the rest: */
5928 else
5929 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5930
5931 pFpuRes->FSW = fFsw;
5932}
5933
5934
5935EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5936EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5937EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5938EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5939
5940
5941/*********************************************************************************************************************************
5942* x87 FPU Addition *
5943*********************************************************************************************************************************/
5944
5945/** Worker for iemAImpl_fadd_r80_by_r80. */
5946static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5947 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5948{
5949 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5950 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5951 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5952}
5953
5954
5955IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5956 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5957{
5958 uint16_t const fFcw = pFpuState->FCW;
5959 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5960
5961 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5962 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5963 {
5964 if (fFcw & X86_FCW_IM)
5965 pFpuRes->r80Result = g_r80Indefinite;
5966 else
5967 {
5968 pFpuRes->r80Result = *pr80Val1;
5969 fFsw |= X86_FSW_ES | X86_FSW_B;
5970 }
5971 fFsw |= X86_FSW_IE;
5972 }
5973 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5974 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5975 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5976 {
5977 if (fFcw & X86_FCW_DM)
5978 {
5979 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5980 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5981 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5982 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5983 }
5984 else
5985 {
5986 pFpuRes->r80Result = *pr80Val1;
5987 fFsw |= X86_FSW_ES | X86_FSW_B;
5988 }
5989 fFsw |= X86_FSW_DE;
5990 }
5991 /* SoftFloat can handle the rest: */
5992 else
5993 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5994
5995 pFpuRes->FSW = fFsw;
5996}
5997
5998
5999EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6000EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6001EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6002EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6003
6004
6005/*********************************************************************************************************************************
6006* x87 FPU Subtraction *
6007*********************************************************************************************************************************/
6008
6009/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6010static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6011 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6012{
6013 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6014 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6015 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6016}
6017
6018
6019IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6020 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6021{
6022 uint16_t const fFcw = pFpuState->FCW;
6023 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6024
6025 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6026 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6027 {
6028 if (fFcw & X86_FCW_IM)
6029 pFpuRes->r80Result = g_r80Indefinite;
6030 else
6031 {
6032 pFpuRes->r80Result = *pr80Val1;
6033 fFsw |= X86_FSW_ES | X86_FSW_B;
6034 }
6035 fFsw |= X86_FSW_IE;
6036 }
6037 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6038 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6039 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6040 {
6041 if (fFcw & X86_FCW_DM)
6042 {
6043 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6044 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6045 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6046 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6047 }
6048 else
6049 {
6050 pFpuRes->r80Result = *pr80Val1;
6051 fFsw |= X86_FSW_ES | X86_FSW_B;
6052 }
6053 fFsw |= X86_FSW_DE;
6054 }
6055 /* SoftFloat can handle the rest: */
6056 else
6057 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6058
6059 pFpuRes->FSW = fFsw;
6060}
6061
6062
6063EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6064EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6065EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6066EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6067
6068
6069/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6070IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6071 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6072{
6073 uint16_t const fFcw = pFpuState->FCW;
6074 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6075
6076 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6077 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6078 {
6079 if (fFcw & X86_FCW_IM)
6080 pFpuRes->r80Result = g_r80Indefinite;
6081 else
6082 {
6083 pFpuRes->r80Result = *pr80Val1;
6084 fFsw |= X86_FSW_ES | X86_FSW_B;
6085 }
6086 fFsw |= X86_FSW_IE;
6087 }
6088 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6089 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6090 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6091 {
6092 if (fFcw & X86_FCW_DM)
6093 {
6094 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6095 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6096 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6097 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6098 }
6099 else
6100 {
6101 pFpuRes->r80Result = *pr80Val1;
6102 fFsw |= X86_FSW_ES | X86_FSW_B;
6103 }
6104 fFsw |= X86_FSW_DE;
6105 }
6106 /* SoftFloat can handle the rest: */
6107 else
6108 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6109
6110 pFpuRes->FSW = fFsw;
6111}
6112
6113
/* Instantiate the FSUBR/FISUBR variants taking a 64-bit real, 32-bit real,
   32-bit integer or 16-bit integer second operand.  Each one is generated on
   top of iemAImpl_fsubr_r80_by_r80.
   NOTE(review): the EMIT_R80_BY_* macros are defined elsewhere in this file; the
   meaning of the trailing 0 argument is not visible here - confirm at the macro. */
EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64,  iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32,  iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6118
6119
6120/*********************************************************************************************************************************
*   x87 FPU Trigonometric Operations                                                                                             *
6122*********************************************************************************************************************************/
6123static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6124{
6125 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6126 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6127 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6128 extFloat80_t v;
6129 (void)fFcw;
6130
6131 v = extF80_atan2(y, x, &SoftState);
6132
6133 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6134 return fFsw;
6135}
6136
6137IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6138 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6139{
6140 uint16_t const fFcw = pFpuState->FCW;
6141 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6142
6143 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6144 {
6145 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6146
6147 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6148 if (!(fFcw & X86_FCW_PM))
6149 fFsw |= X86_FSW_ES | X86_FSW_B;
6150 }
6151 else
6152 {
6153 fFsw |= X86_FSW_IE;
6154 if (!(fFcw & X86_FCW_IM))
6155 {
6156 pFpuRes->r80Result = *pr80Val2;
6157 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6158 }
6159 else
6160 {
6161 pFpuRes->r80Result = g_r80Indefinite;
6162 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6163 }
6164 }
6165
6166 pFpuRes->FSW = fFsw;
6167}
6168#endif /* IEM_WITHOUT_ASSEMBLY */
6169
/**
 * Intel flavoured FPATAN entry point.
 *
 * Simply forwards all arguments to iemAImpl_fpatan_r80_by_r80; there is no
 * vendor specific behaviour here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                          PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6175
/**
 * AMD flavoured FPATAN entry point.
 *
 * Simply forwards all arguments to iemAImpl_fpatan_r80_by_r80; there is no
 * vendor specific behaviour here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6181
6182
6183#if defined(IEM_WITHOUT_ASSEMBLY)
/**
 * FPTAN - portable C placeholder.
 *
 * Not implemented: the arguments are ignored, nothing is written to
 * pFpuResTwo, and a release assertion is raised.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
    AssertReleaseFailed();
}
6189#endif /* IEM_WITHOUT_ASSEMBLY */
6190
/**
 * AMD flavoured FPTAN entry point; forwards to iemAImpl_fptan_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6195
/**
 * Intel flavoured FPTAN entry point; forwards to iemAImpl_fptan_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6200
6201
6202#ifdef IEM_WITHOUT_ASSEMBLY
6203
6204static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6205{
6206 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6207 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6208 extFloat80_t v;
6209 (void)fFcw;
6210
6211 v = extF80_sin(x, &SoftState);
6212
6213 iemFpuSoftF80ToIprt(pr80Result, v);
6214
6215 return fFsw;
6216}
6217
6218IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6219{
6220 uint16_t const fFcw = pFpuState->FCW;
6221 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6222
6223 if (RTFLOAT80U_IS_ZERO(pr80Val))
6224 {
6225 pFpuRes->r80Result = *pr80Val;
6226 }
6227 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6228 {
6229 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6230 {
6231 fFsw |= X86_FSW_C2;
6232 pFpuRes->r80Result = *pr80Val;
6233 }
6234 else
6235 {
6236 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6237 {
6238 pFpuRes->r80Result = *pr80Val;
6239
6240 }
6241 else
6242 {
6243 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6244 }
6245 fFsw |= X86_FSW_PE;
6246 if (!(fFcw & X86_FCW_PM))
6247 fFsw |= X86_FSW_ES | X86_FSW_B;
6248 }
6249 }
6250 else if (RTFLOAT80U_IS_INF(pr80Val))
6251 {
6252 fFsw |= X86_FSW_IE;
6253 if (!(fFcw & X86_FCW_IM))
6254 {
6255 fFsw |= X86_FSW_ES | X86_FSW_B;
6256 pFpuRes->r80Result = *pr80Val;
6257 }
6258 else
6259 {
6260 pFpuRes->r80Result = g_r80Indefinite;
6261 }
6262 }
6263 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6264 {
6265 pFpuRes->r80Result = *pr80Val;
6266 fFsw |= X86_FSW_DE;
6267
6268 if (fFcw & X86_FCW_DM)
6269 {
6270 fFsw |= X86_FSW_UE | X86_FSW_PE;
6271
6272 if (!(fFcw & X86_FCW_UM) || !(fFcw & X86_FCW_PM))
6273 {
6274 fFsw |= X86_FSW_ES | X86_FSW_B;
6275 }
6276 }
6277 else
6278 {
6279 fFsw |= X86_FSW_ES | X86_FSW_B;
6280 }
6281 }
6282 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6283 {
6284 pFpuRes->r80Result = *pr80Val;
6285 fFsw |= X86_FSW_DE;
6286
6287 if (fFcw & X86_FCW_DM)
6288 {
6289 if (fFcw & X86_FCW_PM)
6290 {
6291 fFsw |= X86_FSW_PE;
6292 }
6293 else
6294 {
6295 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6296 }
6297
6298 pFpuRes->r80Result.sj64.uExponent = 1;
6299 }
6300 else
6301 {
6302 fFsw |= X86_FSW_ES | X86_FSW_B;
6303 }
6304 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6305 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6306 {
6307 pFpuRes->r80Result = *pr80Val;
6308 } else {
6309 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6310 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6311 && (fFcw & X86_FCW_IM))
6312 pFpuRes->r80Result = g_r80Indefinite;
6313 else
6314 {
6315 pFpuRes->r80Result = *pr80Val;
6316 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6317 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6318 }
6319
6320 fFsw |= X86_FSW_IE;
6321 if (!(fFcw & X86_FCW_IM))
6322 fFsw |= X86_FSW_ES | X86_FSW_B;
6323 }
6324
6325 pFpuRes->FSW = fFsw;
6326}
6327#endif /* IEM_WITHOUT_ASSEMBLY */
6328
/**
 * AMD flavoured FSIN entry point; forwards to iemAImpl_fsin_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6333
/**
 * Intel flavoured FSIN entry point; forwards to iemAImpl_fsin_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6338
6339#ifdef IEM_WITHOUT_ASSEMBLY
6340
6341static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6342{
6343 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6344 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6345 extFloat80_t v;
6346 (void)fFcw;
6347
6348 v = extF80_cos(x, &SoftState);
6349
6350 iemFpuSoftF80ToIprt(pr80Result, v);
6351
6352 return fFsw;
6353}
6354
6355IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6356{
6357 uint16_t const fFcw = pFpuState->FCW;
6358 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6359
6360 if (RTFLOAT80U_IS_ZERO(pr80Val))
6361 {
6362 pFpuRes->r80Result = g_ar80One[0];
6363 }
6364 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6365 {
6366 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6367 {
6368 fFsw |= X86_FSW_C2;
6369 pFpuRes->r80Result = *pr80Val;
6370 }
6371 else
6372 {
6373 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6374 {
6375 pFpuRes->r80Result = g_ar80One[0];
6376
6377 }
6378 else
6379 {
6380 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6381 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6382 }
6383 fFsw |= X86_FSW_PE;
6384 if (!(fFcw & X86_FCW_PM))
6385 fFsw |= X86_FSW_ES | X86_FSW_B;
6386 }
6387 }
6388 else if (RTFLOAT80U_IS_INF(pr80Val))
6389 {
6390 fFsw |= X86_FSW_IE;
6391 if (!(fFcw & X86_FCW_IM))
6392 {
6393 fFsw |= X86_FSW_ES | X86_FSW_B;
6394 pFpuRes->r80Result = *pr80Val;
6395 }
6396 else
6397 {
6398 pFpuRes->r80Result = g_r80Indefinite;
6399 }
6400 }
6401 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6402 {
6403 fFsw |= X86_FSW_DE;
6404
6405 if (fFcw & X86_FCW_DM)
6406 {
6407 pFpuRes->r80Result = g_ar80One[0];
6408
6409 if (fFcw & X86_FCW_PM)
6410 {
6411 fFsw |= X86_FSW_PE;
6412 }
6413 else
6414 {
6415 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6416 }
6417 }
6418 else
6419 {
6420 pFpuRes->r80Result = *pr80Val;
6421 fFsw |= X86_FSW_ES | X86_FSW_B;
6422 }
6423 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6424 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6425 {
6426 pFpuRes->r80Result = *pr80Val;
6427 } else {
6428 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6429 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6430 && (fFcw & X86_FCW_IM))
6431 pFpuRes->r80Result = g_r80Indefinite;
6432 else
6433 {
6434 pFpuRes->r80Result = *pr80Val;
6435 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6436 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6437 }
6438
6439 fFsw |= X86_FSW_IE;
6440 if (!(fFcw & X86_FCW_IM))
6441 fFsw |= X86_FSW_ES | X86_FSW_B;
6442 }
6443
6444 pFpuRes->FSW = fFsw;
6445}
6446#endif /* IEM_WITHOUT_ASSEMBLY */
6447
/**
 * AMD flavoured FCOS entry point; forwards to iemAImpl_fcos_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6452
/**
 * Intel flavoured FCOS entry point; forwards to iemAImpl_fcos_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6457
6458#ifdef IEM_WITHOUT_ASSEMBLY
6459
6460static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6461{
6462 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6463 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6464 extFloat80_t r80Sin, r80Cos;
6465 (void)fFcw;
6466
6467 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6468
6469 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6470 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6471
6472 return fFsw;
6473}
6474
6475IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6476{
6477 uint16_t const fFcw = pFpuState->FCW;
6478 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6479
6480 if (RTFLOAT80U_IS_ZERO(pr80Val))
6481 {
6482 pFpuResTwo->r80Result1 = *pr80Val;
6483 pFpuResTwo->r80Result2 = g_ar80One[0];
6484 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6485 }
6486 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6487 {
6488 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6489 {
6490 fFsw |= X86_FSW_C2;
6491
6492 if (fFcw & X86_FCW_IM)
6493 {
6494 pFpuResTwo->r80Result1 = g_r80Indefinite;
6495 }
6496 else
6497 {
6498 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6499 }
6500
6501 pFpuResTwo->r80Result2 = *pr80Val;
6502 }
6503 else
6504 {
6505 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6506
6507 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6508 {
6509 pFpuResTwo->r80Result1 = *pr80Val;
6510 pFpuResTwo->r80Result2 = g_ar80One[0];
6511 }
6512 else
6513 {
6514 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6515 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6516 }
6517 fFsw |= X86_FSW_PE;
6518 if (!(fFcw & X86_FCW_PM))
6519 fFsw |= X86_FSW_ES | X86_FSW_B;
6520 }
6521 }
6522 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6523 {
6524 fFsw |= X86_FSW_DE;
6525
6526 if (fFcw & X86_FCW_DM)
6527 {
6528 pFpuResTwo->r80Result1 = *pr80Val;
6529 pFpuResTwo->r80Result2 = g_ar80One[0];
6530 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6531
6532 if (fFcw & X86_FCW_PM)
6533 {
6534 fFsw |= X86_FSW_PE;
6535 }
6536 else
6537 {
6538 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6539 }
6540
6541 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6542 }
6543 else
6544 {
6545 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6546 pFpuResTwo->r80Result2 = *pr80Val;
6547 fFsw |= X86_FSW_ES | X86_FSW_B;
6548 }
6549 }
6550 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6551 {
6552 fFsw |= X86_FSW_DE;
6553
6554 if (fFcw & X86_FCW_DM)
6555 {
6556 pFpuResTwo->r80Result1 = *pr80Val;
6557 pFpuResTwo->r80Result2 = g_ar80One[0];
6558
6559 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6560 fFsw |= X86_FSW_UE | X86_FSW_PE;
6561
6562 if (fFcw & X86_FCW_PM)
6563 {
6564 if (!(fFcw & X86_FCW_UM))
6565 fFsw |= X86_FSW_ES | X86_FSW_B;
6566 }
6567 else
6568 {
6569 fFsw |= X86_FSW_ES | X86_FSW_B;
6570 }
6571 }
6572 else
6573 {
6574 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6575 pFpuResTwo->r80Result2 = *pr80Val;
6576 fFsw |= X86_FSW_ES | X86_FSW_B;
6577 }
6578 }
6579 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6580 {
6581 pFpuResTwo->r80Result1 = *pr80Val;
6582 pFpuResTwo->r80Result2 = *pr80Val;
6583 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6584 }
6585 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6586 {
6587 if (fFcw & X86_FCW_IM)
6588 {
6589 pFpuResTwo->r80Result1 = g_r80Indefinite;
6590 pFpuResTwo->r80Result2 = g_r80Indefinite;
6591 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6592 }
6593 else
6594 {
6595 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6596 pFpuResTwo->r80Result2 = *pr80Val;
6597 }
6598
6599 fFsw |= X86_FSW_IE;
6600 if (!(fFcw & X86_FCW_IM))
6601 fFsw |= X86_FSW_ES | X86_FSW_B;
6602 }
6603 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6604 {
6605 pFpuResTwo->r80Result1 = *pr80Val;
6606 pFpuResTwo->r80Result2 = *pr80Val;
6607
6608 if (fFcw & X86_FCW_IM)
6609 {
6610 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6611 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6612 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6613 }
6614 else
6615 {
6616 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6617 pFpuResTwo->r80Result2 = *pr80Val;
6618 }
6619
6620 fFsw |= X86_FSW_IE;
6621 if (!(fFcw & X86_FCW_IM))
6622 fFsw |= X86_FSW_ES | X86_FSW_B;
6623 }
6624 else if (RTFLOAT80U_IS_INF(pr80Val))
6625 {
6626 if (fFcw & X86_FCW_IM)
6627 {
6628 pFpuResTwo->r80Result1 = g_r80Indefinite;
6629 pFpuResTwo->r80Result2 = g_r80Indefinite;
6630 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6631 }
6632 else
6633 {
6634 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6635 pFpuResTwo->r80Result2 = *pr80Val;
6636 }
6637
6638 fFsw |= X86_FSW_IE;
6639 if (!(fFcw & X86_FCW_IM))
6640 fFsw |= X86_FSW_ES | X86_FSW_B;
6641 }
6642
6643 pFpuResTwo->FSW = fFsw;
6644}
6645#endif /* IEM_WITHOUT_ASSEMBLY */
6646
/**
 * AMD flavoured FSINCOS entry point; forwards to iemAImpl_fsincos_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6651
/**
 * Intel flavoured FSINCOS entry point; forwards to iemAImpl_fsincos_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6656
6657#ifdef IEM_WITHOUT_ASSEMBLY
6658
6659
6660/*********************************************************************************************************************************
6661* x87 FPU Compare and Testing Operations *
6662*********************************************************************************************************************************/
6663
6664IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6665{
6666 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6667
6668 if (RTFLOAT80U_IS_ZERO(pr80Val))
6669 fFsw |= X86_FSW_C3;
6670 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6671 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6672 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6673 {
6674 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6675 if (!(pFpuState->FCW & X86_FCW_DM))
6676 fFsw |= X86_FSW_ES | X86_FSW_B;
6677 }
6678 else
6679 {
6680 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6681 if (!(pFpuState->FCW & X86_FCW_IM))
6682 fFsw |= X86_FSW_ES | X86_FSW_B;
6683 }
6684
6685 *pu16Fsw = fFsw;
6686}
6687
6688
6689IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6690{
6691 RT_NOREF(pFpuState);
6692 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6693
6694 /* C1 = sign bit (always, even if empty Intel says). */
6695 if (pr80Val->s.fSign)
6696 fFsw |= X86_FSW_C1;
6697
6698 /* Classify the value in C0, C2, C3. */
6699 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6700 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6701 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6702 fFsw |= X86_FSW_C2;
6703 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6704 fFsw |= X86_FSW_C3;
6705 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6706 fFsw |= X86_FSW_C0;
6707 else if (RTFLOAT80U_IS_INF(pr80Val))
6708 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6709 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6710 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6711 /* whatever else: 0 */
6712
6713 *pu16Fsw = fFsw;
6714}
6715
6716
6717/**
6718 * Worker for fcom, fucom, and friends.
6719 */
6720static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6721 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6722{
6723 /*
6724 * Unpack the values.
6725 */
6726 bool const fSign1 = pr80Val1->s.fSign;
6727 int32_t iExponent1 = pr80Val1->s.uExponent;
6728 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6729
6730 bool const fSign2 = pr80Val2->s.fSign;
6731 int32_t iExponent2 = pr80Val2->s.uExponent;
6732 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6733
6734 /*
6735 * Check for invalid inputs.
6736 */
6737 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6738 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6739 {
6740 if (!(fFcw & X86_FCW_IM))
6741 fFsw |= X86_FSW_ES | X86_FSW_B;
6742 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6743 }
6744
6745 /*
6746 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6747 */
6748 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6749 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6750 {
6751 if ( fIeOnAllNaNs
6752 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6753 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6754 {
6755 fFsw |= X86_FSW_IE;
6756 if (!(fFcw & X86_FCW_IM))
6757 fFsw |= X86_FSW_ES | X86_FSW_B;
6758 }
6759 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6760 }
6761
6762 /*
6763 * Normalize the values.
6764 */
6765 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6766 {
6767 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6768 iExponent1 = 1;
6769 else
6770 {
6771 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6772 uMantissa1 <<= iExponent1;
6773 iExponent1 = 1 - iExponent1;
6774 }
6775 fFsw |= X86_FSW_DE;
6776 if (!(fFcw & X86_FCW_DM))
6777 fFsw |= X86_FSW_ES | X86_FSW_B;
6778 }
6779
6780 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6781 {
6782 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6783 iExponent2 = 1;
6784 else
6785 {
6786 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6787 uMantissa2 <<= iExponent2;
6788 iExponent2 = 1 - iExponent2;
6789 }
6790 fFsw |= X86_FSW_DE;
6791 if (!(fFcw & X86_FCW_DM))
6792 fFsw |= X86_FSW_ES | X86_FSW_B;
6793 }
6794
6795 /*
6796 * Test if equal (val1 == val2):
6797 */
6798 if ( uMantissa1 == uMantissa2
6799 && iExponent1 == iExponent2
6800 && ( fSign1 == fSign2
6801 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6802 fFsw |= X86_FSW_C3;
6803 /*
6804 * Test if less than (val1 < val2):
6805 */
6806 else if (fSign1 && !fSign2)
6807 fFsw |= X86_FSW_C0;
6808 else if (fSign1 == fSign2)
6809 {
6810 /* Zeros are problematic, however at the most one can be zero here. */
6811 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6812 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6813 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6814 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6815
6816 if ( fSign1
6817 ^ ( iExponent1 < iExponent2
6818 || ( iExponent1 == iExponent2
6819 && uMantissa1 < uMantissa2 ) ) )
6820 fFsw |= X86_FSW_C0;
6821 }
6822 /* else: No flags set if greater. */
6823
6824 return fFsw;
6825}
6826
6827
6828IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6829 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6830{
6831 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6832}
6833
6834
6835
6836
6837IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6838 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6839{
6840 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6841}
6842
6843
6844IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6845 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6846{
6847 RTFLOAT80U r80Val2;
6848 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6849 Assert(!fFsw || fFsw == X86_FSW_DE);
6850 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6851 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6852 {
6853 if (!(pFpuState->FCW & X86_FCW_DM))
6854 fFsw |= X86_FSW_ES | X86_FSW_B;
6855 *pfFsw |= fFsw;
6856 }
6857}
6858
6859
6860IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6861 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6862{
6863 RTFLOAT80U r80Val2;
6864 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6865 Assert(!fFsw || fFsw == X86_FSW_DE);
6866 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6867 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6868 {
6869 if (!(pFpuState->FCW & X86_FCW_DM))
6870 fFsw |= X86_FSW_ES | X86_FSW_B;
6871 *pfFsw |= fFsw;
6872 }
6873}
6874
6875
6876IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6877 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6878{
6879 RTFLOAT80U r80Val2;
6880 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6881 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6882}
6883
6884
6885IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6886 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6887{
6888 RTFLOAT80U r80Val2;
6889 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6890 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6891}
6892
6893
6894/**
6895 * Worker for fcomi & fucomi.
6896 */
6897static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6898 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6899{
6900 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6901 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6902 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6903 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6904
6905 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6906 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6907 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6908}
6909
6910
6911IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6912 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6913{
6914 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
6915}
6916
6917
6918IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6919 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6920{
6921 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
6922}
6923
6924
6925/*********************************************************************************************************************************
6926* x87 FPU Other Operations *
6927*********************************************************************************************************************************/
6928
6929/**
6930 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6931 */
6932static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6933{
6934 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6935 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
6936 true /*exact / generate #PE */, &SoftState));
6937 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6938}
6939
6940
6941IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6942{
6943 uint16_t const fFcw = pFpuState->FCW;
6944 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6945
6946 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6947 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6948 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6949 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6950 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6951 || RTFLOAT80U_IS_INF(pr80Val))
6952 pFpuRes->r80Result = *pr80Val;
6953 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6954 {
6955 fFsw |= X86_FSW_DE;
6956 if (fFcw & X86_FCW_DM)
6957 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6958 else
6959 {
6960 pFpuRes->r80Result = *pr80Val;
6961 fFsw |= X86_FSW_ES | X86_FSW_B;
6962 }
6963 }
6964 else
6965 {
6966 if (fFcw & X86_FCW_IM)
6967 {
6968 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6969 pFpuRes->r80Result = g_r80Indefinite;
6970 else
6971 {
6972 pFpuRes->r80Result = *pr80Val;
6973 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6974 }
6975 }
6976 else
6977 {
6978 pFpuRes->r80Result = *pr80Val;
6979 fFsw |= X86_FSW_ES | X86_FSW_B;
6980 }
6981 fFsw |= X86_FSW_IE;
6982 }
6983 pFpuRes->FSW = fFsw;
6984}
6985
6986
6987IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6988 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6989{
6990 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
6991 it does everything we need it to do. */
6992 uint16_t const fFcw = pFpuState->FCW;
6993 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6994 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6995 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6996 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6997}
6998
6999
7000/**
7001 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7002 */
7003static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7004{
7005 Assert(!pr80Val->s.fSign);
7006 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7007 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7008 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7009}
7010
7011
7012IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7013{
7014 uint16_t const fFcw = pFpuState->FCW;
7015 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7016
7017 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7018 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7019 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7020 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7021 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7022 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7023 pFpuRes->r80Result = *pr80Val;
7024 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7025 {
7026 fFsw |= X86_FSW_DE;
7027 if (fFcw & X86_FCW_DM)
7028 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7029 else
7030 {
7031 pFpuRes->r80Result = *pr80Val;
7032 fFsw |= X86_FSW_ES | X86_FSW_B;
7033 }
7034 }
7035 else
7036 {
7037 if (fFcw & X86_FCW_IM)
7038 {
7039 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7040 pFpuRes->r80Result = g_r80Indefinite;
7041 else
7042 {
7043 pFpuRes->r80Result = *pr80Val;
7044 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7045 }
7046 }
7047 else
7048 {
7049 pFpuRes->r80Result = *pr80Val;
7050 fFsw |= X86_FSW_ES | X86_FSW_B;
7051 }
7052 fFsw |= X86_FSW_IE;
7053 }
7054 pFpuRes->FSW = fFsw;
7055}
7056
7057
7058/**
7059 * @code{.unparsed}
7060 * x x * ln2
7061 * f(x) = 2 - 1 = e - 1
7062 *
7063 * @endcode
7064 *
7065 * We can approximate e^x by a Taylor/Maclaurin series (see
7066 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7067 * @code{.unparsed}
7068 * n 0 1 2 3 4
7069 * inf x x x x x x
7070 * SUM ----- = --- + --- + --- + --- + --- + ...
7071 * n=0 n! 0! 1! 2! 3! 4!
7072 *
7073 * 2 3 4
7074 * x x x
7075 * = 1 + x + --- + --- + --- + ...
7076 * 2! 3! 4!
7077 * @endcode
7078 *
7079 * Given z = x * ln2, we get:
7080 * @code{.unparsed}
7081 * 2 3 4 n
7082 * z z z z z
7083 * e - 1 = z + --- + --- + --- + ... + ---
7084 * 2! 3! 4! n!
7085 * @endcode
7086 *
7087 * Wanting to use Horner's method, we move one z outside and get:
7088 * @code{.unparsed}
7089 * 2 3 (n-1)
7090 * z z z z
7091 * = z ( 1 + --- + --- + --- + ... + ------- )
7092 * 2! 3! 4! n!
7093 * @endcode
7094 *
7095 * The constants we need for using Horner's methods are 1 and 1 / n!.
7096 *
 * For very tiny x values, we can get away with f(x) = x * ln 2, because
 * we don't have the necessary precision to represent 1.0 + z/3 + ...
 * and can approximate it to be 1.0.  For a visual demonstration of this
 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
 * as it is valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7102 *
7103 *
7104 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7105 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7106 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7107 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7108 * blocks). (The one bit difference is probably an implicit one missing from
7109 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7110 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7111 * exponent.
7112 *
7113 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
7114 * successfully reproduced the exact results from an Intel 10980XE, there is
7115 * always a portition of rounding differences. Not going to spend too much time
7116 * on getting this 100% the same, at least not now.
7117 *
7118 * P.S. If someone are really curious about 8087 and its contstants:
7119 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7120 *
7121 *
7122 * @param pr80Val The exponent value (x), less than 1.0, greater than
7123 * -1.0 and not zero. This can be a normal, denormal
7124 * or pseudo-denormal value.
7125 * @param pr80Result Where to return the result.
7126 * @param fFcw FPU control word.
7127 * @param fFsw FPU status word.
7128 */
7129static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7130{
7131 /* As mentioned above, we can skip the expensive polynomial calculation
7132 as it will be close enough to 1.0 that it makes no difference.
7133
7134 The cutoff point for intel 10980XE is exponents >= -69. Intel
7135 also seems to be using a 67-bit or 68-bit constant value, and we get
7136 a smattering of rounding differences if we go for higher precision. */
7137 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7138 {
7139 RTUINT256U u256;
7140 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7141 u256.QWords.qw0 |= 1; /* force #PE */
7142 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7143 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7144 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7145 : 1 - RTFLOAT80U_EXP_BIAS,
7146 fFcw, fFsw);
7147 }
7148 else
7149 {
7150#ifdef IEM_WITH_FLOAT128_FOR_FPU
7151 /* This approach is not good enough for small values, we end up with zero. */
7152 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7153 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7154 _Float128 rd128Result = powf128(2.0L, rd128Val);
7155 rd128Result -= 1.0L;
7156 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7157 iemFpuF128RestoreRounding(fOldRounding);
7158
7159# else
7160 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7161 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7162
7163 /* As mentioned above, enforce 68-bit internal mantissa width to better
7164 match the Intel 10980XE results. */
7165 unsigned const cPrecision = 68;
7166
7167 /* first calculate z = x * ln2 */
7168 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7169 cPrecision);
7170
7171 /* Then do the polynomial evaluation. */
7172 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7173 cPrecision, &SoftState);
7174 r = f128_mul(z, r, &SoftState);
7175
7176 /* Output the result. */
7177 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7178# endif
7179 }
7180 return fFsw;
7181}
7182
7183
7184IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7185{
7186 uint16_t const fFcw = pFpuState->FCW;
7187 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7188
7189 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7190 {
7191 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7192 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7193 else
7194 {
7195 /* Special case:
7196 2^+1.0 - 1.0 = 1.0
7197 2^-1.0 - 1.0 = -0.5 */
7198 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7199 && pr80Val->s.uMantissa == RT_BIT_64(63))
7200 {
7201 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7202 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7203 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7204 }
7205 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7206 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7207 else
7208 pFpuRes->r80Result = *pr80Val;
7209 fFsw |= X86_FSW_PE;
7210 if (!(fFcw & X86_FCW_PM))
7211 fFsw |= X86_FSW_ES | X86_FSW_B;
7212 }
7213 }
7214 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7215 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7216 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7217 pFpuRes->r80Result = *pr80Val;
7218 else if (RTFLOAT80U_IS_INF(pr80Val))
7219 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7220 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7221 {
7222 fFsw |= X86_FSW_DE;
7223 if (fFcw & X86_FCW_DM)
7224 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7225 else
7226 {
7227 pFpuRes->r80Result = *pr80Val;
7228 fFsw |= X86_FSW_ES | X86_FSW_B;
7229 }
7230 }
7231 else
7232 {
7233 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7234 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7235 && (fFcw & X86_FCW_IM))
7236 pFpuRes->r80Result = g_r80Indefinite;
7237 else
7238 {
7239 pFpuRes->r80Result = *pr80Val;
7240 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7241 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7242 }
7243 fFsw |= X86_FSW_IE;
7244 if (!(fFcw & X86_FCW_IM))
7245 fFsw |= X86_FSW_ES | X86_FSW_B;
7246 }
7247 pFpuRes->FSW = fFsw;
7248}
7249
7250#endif /* IEM_WITHOUT_ASSEMBLY */
7251
7252IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7253{
7254 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7255}
7256
7257IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7258{
7259 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7260}
7261
7262#ifdef IEM_WITHOUT_ASSEMBLY
7263
7264IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7265{
7266 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7267 pFpuRes->r80Result = *pr80Val;
7268 pFpuRes->r80Result.s.fSign = 0;
7269}
7270
7271
7272IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7273{
7274 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7275 pFpuRes->r80Result = *pr80Val;
7276 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7277}
7278
7279
7280IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7281{
7282 uint16_t const fFcw = pFpuState->FCW;
7283 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7284
7285 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7286 {
7287 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7288 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7289
7290 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7291 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7292 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7293 }
7294 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7295 {
7296 fFsw |= X86_FSW_ZE;
7297 if (fFcw & X86_FCW_ZM)
7298 {
7299 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7300 pFpuResTwo->r80Result2 = *pr80Val;
7301 }
7302 else
7303 {
7304 pFpuResTwo->r80Result2 = *pr80Val;
7305 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7306 }
7307 }
7308 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7309 {
7310 fFsw |= X86_FSW_DE;
7311 if (fFcw & X86_FCW_DM)
7312 {
7313 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7314 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7315 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7316 int32_t iExponent = -16382;
7317 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7318 {
7319 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7320 iExponent--;
7321 }
7322
7323 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7324 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7325 }
7326 else
7327 {
7328 pFpuResTwo->r80Result2 = *pr80Val;
7329 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7330 }
7331 }
7332 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7333 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7334 {
7335 pFpuResTwo->r80Result1 = *pr80Val;
7336 pFpuResTwo->r80Result2 = *pr80Val;
7337 }
7338 else if (RTFLOAT80U_IS_INF(pr80Val))
7339 {
7340 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7341 pFpuResTwo->r80Result2 = *pr80Val;
7342 }
7343 else
7344 {
7345 if (fFcw & X86_FCW_IM)
7346 {
7347 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7348 pFpuResTwo->r80Result1 = g_r80Indefinite;
7349 else
7350 {
7351 pFpuResTwo->r80Result1 = *pr80Val;
7352 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7353 }
7354 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7355 }
7356 else
7357 {
7358 pFpuResTwo->r80Result2 = *pr80Val;
7359 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7360 }
7361 fFsw |= X86_FSW_IE;
7362 }
7363 pFpuResTwo->FSW = fFsw;
7364}
7365#endif /* IEM_WITHOUT_ASSEMBLY */
7366
7367#if defined(IEM_WITHOUT_ASSEMBLY)
7368
7369static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7370{
7371 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7372 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7373 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7374 extFloat80_t v;
7375 (void)fFcw;
7376
7377 v = extF80_ylog2x(y, x, &SoftState);
7378 iemFpuSoftF80ToIprt(pr80Result, v);
7379
7380 return fFsw;
7381}
7382
7383IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7384 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7385{
7386 uint16_t const fFcw = pFpuState->FCW;
7387 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7388
7389 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7390 {
7391 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7392
7393 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7394 if (!(fFcw & X86_FCW_PM))
7395 fFsw |= X86_FSW_ES | X86_FSW_B;
7396 }
7397 else
7398 {
7399 fFsw |= X86_FSW_IE;
7400
7401 if (!(fFcw & X86_FCW_IM))
7402 {
7403 pFpuRes->r80Result = *pr80Val2;
7404 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7405 }
7406 else
7407 {
7408 pFpuRes->r80Result = g_r80Indefinite;
7409 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7410 }
7411 }
7412
7413 pFpuRes->FSW = fFsw;
7414}
7415#endif /* IEM_WITHOUT_ASSEMBLY */
7416
7417IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7418 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7419{
7420 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7421}
7422
7423IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7424 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7425{
7426 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7427}
7428
7429#if defined(IEM_WITHOUT_ASSEMBLY)
7430
7431static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7432{
7433 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7434 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7435 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7436 extFloat80_t v;
7437 (void)fFcw;
7438
7439 v = extF80_ylog2xp1(y, x, &SoftState);
7440 iemFpuSoftF80ToIprt(pr80Result, v);
7441
7442 return fFsw;
7443}
7444
7445IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7446 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7447{
7448 uint16_t const fFcw = pFpuState->FCW;
7449 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7450
7451 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7452 {
7453 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7454
7455 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7456 if (!(fFcw & X86_FCW_PM))
7457 fFsw |= X86_FSW_ES | X86_FSW_B;
7458 }
7459 else
7460 {
7461 fFsw |= X86_FSW_IE;
7462
7463 if (!(fFcw & X86_FCW_IM))
7464 {
7465 pFpuRes->r80Result = *pr80Val2;
7466 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7467 }
7468 else
7469 {
7470 pFpuRes->r80Result = g_r80Indefinite;
7471 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7472 }
7473 }
7474
7475 pFpuRes->FSW = fFsw;
7476}
7477
7478#endif /* IEM_WITHOUT_ASSEMBLY */
7479
7480IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7481 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7482{
7483 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7484}
7485
7486IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7487 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7488{
7489 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7490}
7491
7492
7493/*********************************************************************************************************************************
7494* MMX, SSE & AVX *
7495*********************************************************************************************************************************/
7496
7497/*
7498 * MOVSLDUP / VMOVSLDUP
7499 */
7500IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7501{
7502 puDst->au32[0] = puSrc->au32[0];
7503 puDst->au32[1] = puSrc->au32[0];
7504 puDst->au32[2] = puSrc->au32[2];
7505 puDst->au32[3] = puSrc->au32[2];
7506}
7507
7508#ifdef IEM_WITH_VEX
7509
7510IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7511{
7512 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7513 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7514 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7515 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7516 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7517 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7518 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7519 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7520}
7521
7522
7523IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7524{
7525 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7526 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7527 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7528 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7529 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7530 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7531 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7532 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7533}
7534
7535#endif /* IEM_WITH_VEX */
7536
7537
7538/*
7539 * MOVSHDUP / VMOVSHDUP
7540 */
7541IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7542{
7543 puDst->au32[0] = puSrc->au32[1];
7544 puDst->au32[1] = puSrc->au32[1];
7545 puDst->au32[2] = puSrc->au32[3];
7546 puDst->au32[3] = puSrc->au32[3];
7547}
7548
7549#ifdef IEM_WITH_VEX
7550
7551IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7552{
7553 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7554 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7555 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7556 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7557 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7558 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7559 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7560 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7561}
7562
7563
7564IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7565{
7566 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7567 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7568 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7569 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7570 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7571 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7572 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7573 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7574}
7575
7576#endif /* IEM_WITH_VEX */
7577
7578
7579/*
7580 * MOVDDUP / VMOVDDUP
7581 */
7582IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PRTUINT128U puDst, uint64_t uSrc))
7583{
7584 puDst->au64[0] = uSrc;
7585 puDst->au64[1] = uSrc;
7586}
7587
7588#ifdef IEM_WITH_VEX
7589
7590IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7591{
7592 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7593 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7594 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7595 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7596}
7597
7598IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7599{
7600 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7601 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7602 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7603 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7604}
7605
7606#endif /* IEM_WITH_VEX */
7607
7608
7609/*
 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
7611 */
7612#ifdef IEM_WITHOUT_ASSEMBLY
7613
7614IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7615{
7616 RT_NOREF(pFpuState);
7617 *puDst &= *puSrc;
7618}
7619
7620
7621IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7622{
7623 RT_NOREF(pFpuState);
7624 puDst->au64[0] &= puSrc->au64[0];
7625 puDst->au64[1] &= puSrc->au64[1];
7626}
7627
7628#endif
7629
7630IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7631 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7632{
7633 RT_NOREF(pExtState);
7634 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7635 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7636}
7637
7638
7639IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7640 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7641{
7642 RT_NOREF(pExtState);
7643 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7644 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7645 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7646 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7647}
7648
7649
7650/*
 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
7652 */
7653#ifdef IEM_WITHOUT_ASSEMBLY
7654
7655IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7656{
7657 RT_NOREF(pFpuState);
7658 *puDst = ~*puDst & *puSrc;
7659}
7660
7661
7662IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7663{
7664 RT_NOREF(pFpuState);
7665 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7666 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7667}
7668
7669#endif
7670
7671IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7672 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7673{
7674 RT_NOREF(pExtState);
7675 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7676 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7677}
7678
7679
7680IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7681 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7682{
7683 RT_NOREF(pExtState);
7684 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7685 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7686 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7687 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7688}
7689
7690
7691/*
 * POR / VPOR / ORPS / VORPS / ORPD / VORPD
7693 */
7694#ifdef IEM_WITHOUT_ASSEMBLY
7695
7696IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7697{
7698 RT_NOREF(pFpuState);
7699 *puDst |= *puSrc;
7700}
7701
7702
7703IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7704{
7705 RT_NOREF(pFpuState);
7706 puDst->au64[0] |= puSrc->au64[0];
7707 puDst->au64[1] |= puSrc->au64[1];
7708}
7709
7710#endif
7711
7712IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7713 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7714{
7715 RT_NOREF(pExtState);
7716 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7717 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7718}
7719
7720
7721IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7722 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7723{
7724 RT_NOREF(pExtState);
7725 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7726 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7727 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7728 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7729}
7730
7731
7732/*
 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
7734 */
7735#ifdef IEM_WITHOUT_ASSEMBLY
7736
7737IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7738{
7739 RT_NOREF(pFpuState);
7740 *puDst ^= *puSrc;
7741}
7742
7743
7744IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7745{
7746 RT_NOREF(pFpuState);
7747 puDst->au64[0] ^= puSrc->au64[0];
7748 puDst->au64[1] ^= puSrc->au64[1];
7749}
7750
7751#endif
7752
7753IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7754 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7755{
7756 RT_NOREF(pExtState);
7757 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7758 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7759}
7760
7761
7762IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7763 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7764{
7765 RT_NOREF(pExtState);
7766 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7767 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7768 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7769 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7770}
7771
7772
7773/*
7774 * PCMPEQB / VPCMPEQB
7775 */
7776#ifdef IEM_WITHOUT_ASSEMBLY
7777
7778IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7779{
7780 RT_NOREF(pFpuState);
7781 RTUINT64U uSrc1 = { *puDst };
7782 RTUINT64U uSrc2 = { *puSrc };
7783 RTUINT64U uDst;
7784 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7785 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7786 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7787 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7788 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7789 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7790 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7791 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7792 *puDst = uDst.u;
7793}
7794
7795
7796IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7797{
7798 RT_NOREF(pFpuState);
7799 RTUINT128U uSrc1 = *puDst;
7800 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7801 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7802 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7803 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7804 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7805 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7806 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7807 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7808 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7809 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7810 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7811 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7812 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7813 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7814 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7815 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7816}
7817
7818#endif
7819
7820IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7821 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7822{
7823 RT_NOREF(pExtState);
7824 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7825 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7826 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7827 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7828 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7829 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7830 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7831 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7832 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7833 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7834 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7835 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7836 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7837 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7838 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7839 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7840}
7841
7842IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7843 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7844{
7845 RT_NOREF(pExtState);
7846 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7847 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7848 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7849 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7850 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7851 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7852 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7853 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7854 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7855 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7856 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7857 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7858 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7859 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7860 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7861 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7862 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7863 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7864 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7865 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7866 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7867 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7868 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7869 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7870 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7871 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7872 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7873 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7874 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7875 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7876 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7877 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7878}
7879
7880
7881/*
7882 * PCMPEQW / VPCMPEQW
7883 */
7884#ifdef IEM_WITHOUT_ASSEMBLY
7885
7886IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7887{
7888 RT_NOREF(pFpuState);
7889 RTUINT64U uSrc1 = { *puDst };
7890 RTUINT64U uSrc2 = { *puSrc };
7891 RTUINT64U uDst;
7892 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7893 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7894 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7895 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7896 *puDst = uDst.u;
7897}
7898
7899
7900IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7901{
7902 RT_NOREF(pFpuState);
7903 RTUINT128U uSrc1 = *puDst;
7904 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7905 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7906 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7907 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7908 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7909 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7910 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7911 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7912}
7913
7914#endif
7915
7916IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7917 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7918{
7919 RT_NOREF(pExtState);
7920 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7921 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7922 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7923 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7924 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7925 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7926 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7927 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7928}
7929
7930IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7931 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7932{
7933 RT_NOREF(pExtState);
7934 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7935 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7936 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7937 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7938 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7939 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7940 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7941 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7942 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
7943 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
7944 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
7945 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
7946 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
7947 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
7948 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
7949 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
7950}
7951
7952
7953/*
7954 * PCMPEQD / VPCMPEQD.
7955 */
7956#ifdef IEM_WITHOUT_ASSEMBLY
7957
7958IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7959{
7960 RT_NOREF(pFpuState);
7961 RTUINT64U uSrc1 = { *puDst };
7962 RTUINT64U uSrc2 = { *puSrc };
7963 RTUINT64U uDst;
7964 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
7965 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
7966 *puDst = uDst.u;
7967}
7968
7969
7970IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7971{
7972 RT_NOREF(pFpuState);
7973 RTUINT128U uSrc1 = *puDst;
7974 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
7975 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
7976 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
7977 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
7978}
7979
7980#endif /* IEM_WITHOUT_ASSEMBLY */
7981
7982IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7983 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7984{
7985 RT_NOREF(pExtState);
7986 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7987 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7988 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7989 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7990}
7991
7992IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7993 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7994{
7995 RT_NOREF(pExtState);
7996 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7997 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7998 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7999 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8000 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8001 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8002 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8003 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8004}
8005
8006
8007/*
8008 * PCMPEQQ / VPCMPEQQ.
8009 */
8010IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8011{
8012 RT_NOREF(pFpuState);
8013 RTUINT128U uSrc1 = *puDst;
8014 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8015 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8016}
8017
8018IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8019 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8020{
8021 RT_NOREF(pExtState);
8022 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8023 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8024}
8025
8026IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8027 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8028{
8029 RT_NOREF(pExtState);
8030 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8031 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8032 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8033 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8034}
8035
8036
8037/*
8038 * PCMPGTB / VPCMPGTB
8039 */
8040#ifdef IEM_WITHOUT_ASSEMBLY
8041
8042IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8043{
8044 RT_NOREF(pFpuState);
8045 RTUINT64U uSrc1 = { *puDst };
8046 RTUINT64U uSrc2 = { *puSrc };
8047 RTUINT64U uDst;
8048 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8049 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8050 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8051 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8052 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8053 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8054 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8055 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8056 *puDst = uDst.u;
8057}
8058
8059
8060IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8061{
8062 RT_NOREF(pFpuState);
8063 RTUINT128U uSrc1 = *puDst;
8064 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8065 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8066 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8067 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8068 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8069 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8070 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8071 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8072 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8073 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8074 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8075 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8076 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8077 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8078 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8079 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8080}
8081
8082#endif
8083
8084IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8085 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8086{
8087 RT_NOREF(pExtState);
8088 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8089 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8090 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8091 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8092 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8093 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8094 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8095 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8096 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8097 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8098 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8099 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8100 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8101 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8102 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8103 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8104}
8105
8106IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8107 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8108{
8109 RT_NOREF(pExtState);
8110 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8111 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8112 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8113 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8114 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8115 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8116 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8117 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8118 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8119 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8120 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8121 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8122 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8123 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8124 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8125 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8126 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8127 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8128 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8129 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8130 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8131 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8132 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8133 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8134 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8135 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8136 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8137 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8138 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8139 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8140 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8141 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8142}
8143
8144
8145/*
8146 * PCMPGTW / VPCMPGTW
8147 */
8148#ifdef IEM_WITHOUT_ASSEMBLY
8149
8150IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8151{
8152 RT_NOREF(pFpuState);
8153 RTUINT64U uSrc1 = { *puDst };
8154 RTUINT64U uSrc2 = { *puSrc };
8155 RTUINT64U uDst;
8156 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8157 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8158 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8159 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8160 *puDst = uDst.u;
8161}
8162
8163
8164IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8165{
8166 RT_NOREF(pFpuState);
8167 RTUINT128U uSrc1 = *puDst;
8168 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8169 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8170 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8171 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8172 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8173 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8174 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8175 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8176}
8177
8178#endif
8179
8180IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8181 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8182{
8183 RT_NOREF(pExtState);
8184 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8185 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8186 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8187 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8188 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8189 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8190 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8191 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8192}
8193
8194IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8195 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8196{
8197 RT_NOREF(pExtState);
8198 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8199 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8200 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8201 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8202 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8203 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8204 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8205 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8206 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8207 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8208 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8209 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8210 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8211 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8212 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8213 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8214}
8215
8216
8217/*
8218 * PCMPGTD / VPCMPGTD.
8219 */
8220#ifdef IEM_WITHOUT_ASSEMBLY
8221
8222IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8223{
8224 RT_NOREF(pFpuState);
8225 RTUINT64U uSrc1 = { *puDst };
8226 RTUINT64U uSrc2 = { *puSrc };
8227 RTUINT64U uDst;
8228 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8229 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8230 *puDst = uDst.u;
8231}
8232
8233
8234IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8235{
8236 RT_NOREF(pFpuState);
8237 RTUINT128U uSrc1 = *puDst;
8238 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8239 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8240 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8241 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8242}
8243
8244#endif /* IEM_WITHOUT_ASSEMBLY */
8245
8246IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8247 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8248{
8249 RT_NOREF(pExtState);
8250 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8251 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8252 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8253 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8254}
8255
8256IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8257 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8258{
8259 RT_NOREF(pExtState);
8260 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8261 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8262 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8263 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8264 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8265 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8266 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8267 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8268}
8269
8270
8271/*
8272 * PCMPGTQ / VPCMPGTQ.
8273 */
8274IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8275{
8276 RT_NOREF(pFpuState);
8277 RTUINT128U uSrc1 = *puDst;
8278 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8279 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8280}
8281
8282IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8283 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8284{
8285 RT_NOREF(pExtState);
8286 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8287 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8288}
8289
8290IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8291 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8292{
8293 RT_NOREF(pExtState);
8294 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8295 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8296 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8297 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8298}
8299
8300
8301/*
8302 * PADDB / VPADDB
8303 */
8304#ifdef IEM_WITHOUT_ASSEMBLY
8305
8306IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8307{
8308 RT_NOREF(pFpuState);
8309 RTUINT64U uSrc1 = { *puDst };
8310 RTUINT64U uSrc2 = { *puSrc };
8311 RTUINT64U uDst;
8312 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8313 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8314 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8315 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8316 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8317 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8318 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8319 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8320 *puDst = uDst.u;
8321}
8322
8323
8324IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8325{
8326 RT_NOREF(pFpuState);
8327 RTUINT128U uSrc1 = *puDst;
8328 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8329 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8330 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8331 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8332 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8333 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8334 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8335 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8336 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8337 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8338 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8339 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8340 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8341 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8342 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8343 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8344}
8345
8346#endif
8347
8348
8349IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8350 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8351{
8352 RT_NOREF(pExtState);
8353 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8354 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8355 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8356 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8357 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8358 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8359 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8360 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8361 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8362 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8363 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8364 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8365 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8366 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8367 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8368 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8369}
8370
8371IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8372 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8373{
8374 RT_NOREF(pExtState);
8375 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8376 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8377 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8378 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8379 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8380 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8381 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8382 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8383 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8384 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8385 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8386 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8387 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8388 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8389 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8390 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8391 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8392 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8393 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8394 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8395 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8396 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8397 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8398 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8399 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8400 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8401 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8402 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8403 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8404 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8405 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8406 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8407}
8408
8409
8410/*
8411 * PADDSB / VPADDSB
8412 */
/**
 * Narrows a signed word value to a signed byte with saturation, yielding the
 * byte's bit pattern as an unsigned value.
 *
 * Values in the range -0x80..0x7f are truncated to their low byte.  Anything
 * else yields 0x7f (INT8_MAX) when bit 15 of the input is clear and 0x80
 * (INT8_MIN) when it is set.
 *
 * @param   a_iWord     The signed value to narrow.  Evaluated more than once,
 *                      so it must be free of side effects.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff /* biased value does not fit a byte => saturate */ \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) /* 0x7f = INT8_MAX; 0x80 = INT8_MIN; source bit 15 = sign */ \
      : (uint8_t)(a_iWord) )
8417
8418#ifdef IEM_WITHOUT_ASSEMBLY
8419
8420IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8421{
8422 RT_NOREF(pFpuState);
8423 RTUINT64U uSrc1 = { *puDst };
8424 RTUINT64U uSrc2 = { *puSrc };
8425 RTUINT64U uDst;
8426 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8427 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8428 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8429 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8430 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8431 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8432 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8433 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8434 *puDst = uDst.u;
8435}
8436
8437
8438IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8439{
8440 RT_NOREF(pFpuState);
8441 RTUINT128U uSrc1 = *puDst;
8442 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8443 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8444 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8445 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8446 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8447 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8448 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8449 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8450 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8451 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8452 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8453 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8454 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8455 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8456 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8457 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8458}
8459
8460#endif
8461
8462
8463/*
8464 * PADDUSB / VPADDUSB
8465 */
/**
 * Narrows an unsigned word value to an unsigned byte with saturation.
 *
 * The input is first converted to uint16_t; results up to 0xff are passed
 * through and anything larger yields 0xff (UINT8_MAX).
 *
 * @param   a_uWord     The value to narrow.  Evaluated more than once, so it
 *                      must be free of side effects.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff /* 0xff = UINT8_MAX */ \
      : (uint8_t)(a_uWord) )
8470
8471#ifdef IEM_WITHOUT_ASSEMBLY
8472
8473IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8474{
8475 RT_NOREF(pFpuState);
8476 RTUINT64U uSrc1 = { *puDst };
8477 RTUINT64U uSrc2 = { *puSrc };
8478 RTUINT64U uDst;
8479 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8480 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8481 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8482 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8483 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8484 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8485 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8486 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8487 *puDst = uDst.u;
8488}
8489
8490
8491IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8492{
8493 RT_NOREF(pFpuState);
8494 RTUINT128U uSrc1 = *puDst;
8495 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8496 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8497 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8498 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8499 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8500 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8501 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8502 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8503 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8504 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8505 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8506 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8507 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8508 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8509 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8510 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8511}
8512
8513#endif
8514
8515
8516/*
8517 * PADDW / VPADDW
8518 */
8519#ifdef IEM_WITHOUT_ASSEMBLY
8520
8521IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8522{
8523 RT_NOREF(pFpuState);
8524 RTUINT64U uSrc1 = { *puDst };
8525 RTUINT64U uSrc2 = { *puSrc };
8526 RTUINT64U uDst;
8527 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8528 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8529 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8530 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8531 *puDst = uDst.u;
8532}
8533
8534
8535IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8536{
8537 RT_NOREF(pFpuState);
8538 RTUINT128U uSrc1 = *puDst;
8539 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8540 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8541 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8542 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8543 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8544 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8545 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8546 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8547}
8548
8549#endif
8550
8551
8552IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8553 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8554{
8555 RT_NOREF(pExtState);
8556 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8557 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8558 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8559 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8560 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8561 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8562 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8563 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8564}
8565
8566IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8567 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8568{
8569 RT_NOREF(pExtState);
8570 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8571 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8572 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8573 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8574 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8575 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8576 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8577 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8578 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8579 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8580 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8581 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8582 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8583 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8584 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8585 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8586}
8587
8588
8589/*
8590 * PADDSW / VPADDSW
8591 */
/**
 * Clamps a signed 32-bit value to the signed 16-bit range and yields the
 * result as a 16-bit bit pattern.
 *
 * Adding 0x8000 maps the representable range [-0x8000, 0x7fff] onto
 * [0, 0xffff]; anything outside that window saturates.  The saturated value
 * is 0x7fff (INT16_MAX) plus the sign bit (bit 31) of the input, i.e. 0x7fff
 * on positive overflow and 0x8000 (INT16_MIN) on negative overflow.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    (   (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
     ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
     : (uint16_t)(a_iDword) )
8596
8597#ifdef IEM_WITHOUT_ASSEMBLY
8598
8599IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8600{
8601 RT_NOREF(pFpuState);
8602 RTUINT64U uSrc1 = { *puDst };
8603 RTUINT64U uSrc2 = { *puSrc };
8604 RTUINT64U uDst;
8605 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8606 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8607 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8608 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8609 *puDst = uDst.u;
8610}
8611
8612
8613IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8614{
8615 RT_NOREF(pFpuState);
8616 RTUINT128U uSrc1 = *puDst;
8617 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8618 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8619 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8620 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8621 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8622 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8623 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8624 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8625}
8626
8627#endif
8628
8629
8630/*
8631 * PADDUSW / VPADDUSW
8632 */
/**
 * Clamps an unsigned 32-bit value to the unsigned 16-bit range.
 *
 * Values above 0xffff (UINT16_MAX) yield 0xffff; everything else is passed
 * through unchanged.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    (   (uint32_t)(a_uDword) > (uint16_t)0xffff \
     ? (uint16_t)0xffff \
     : (uint16_t)(a_uDword) )
8637
8638#ifdef IEM_WITHOUT_ASSEMBLY
8639
8640IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8641{
8642 RT_NOREF(pFpuState);
8643 RTUINT64U uSrc1 = { *puDst };
8644 RTUINT64U uSrc2 = { *puSrc };
8645 RTUINT64U uDst;
8646 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8647 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8648 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8649 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8650 *puDst = uDst.u;
8651}
8652
8653
8654IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8655{
8656 RT_NOREF(pFpuState);
8657 RTUINT128U uSrc1 = *puDst;
8658 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8659 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8660 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8661 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8662 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8663 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8664 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8665 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8666}
8667
8668#endif
8669
8670
8671/*
8672 * PADDD / VPADDD.
8673 */
8674#ifdef IEM_WITHOUT_ASSEMBLY
8675
8676IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8677{
8678 RT_NOREF(pFpuState);
8679 RTUINT64U uSrc1 = { *puDst };
8680 RTUINT64U uSrc2 = { *puSrc };
8681 RTUINT64U uDst;
8682 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8683 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8684 *puDst = uDst.u;
8685}
8686
8687
8688IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8689{
8690 RT_NOREF(pFpuState);
8691 RTUINT128U uSrc1 = *puDst;
8692 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8693 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8694 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8695 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8696}
8697
8698#endif /* IEM_WITHOUT_ASSEMBLY */
8699
8700IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8701 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8702{
8703 RT_NOREF(pExtState);
8704 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8705 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8706 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8707 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8708}
8709
8710IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8711 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8712{
8713 RT_NOREF(pExtState);
8714 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8715 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8716 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8717 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8718 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8719 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8720 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8721 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8722}
8723
8724
8725/*
8726 * PADDQ / VPADDQ.
8727 */
8728#ifdef IEM_WITHOUT_ASSEMBLY
8729
8730IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8731{
8732 RT_NOREF(pFpuState);
8733 *puDst = *puDst + *puSrc;
8734}
8735
8736IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8737{
8738 RT_NOREF(pFpuState);
8739 RTUINT128U uSrc1 = *puDst;
8740 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8741 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8742}
8743
8744#endif
8745
8746IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8747 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8748{
8749 RT_NOREF(pExtState);
8750 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8751 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8752}
8753
8754IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8755 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8756{
8757 RT_NOREF(pExtState);
8758 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8759 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8760 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8761 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8762}
8763
8764
8765/*
8766 * PSUBB / VPSUBB
8767 */
8768#ifdef IEM_WITHOUT_ASSEMBLY
8769
8770IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8771{
8772 RT_NOREF(pFpuState);
8773 RTUINT64U uSrc1 = { *puDst };
8774 RTUINT64U uSrc2 = { *puSrc };
8775 RTUINT64U uDst;
8776 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8777 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8778 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8779 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8780 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8781 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8782 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8783 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8784 *puDst = uDst.u;
8785}
8786
8787
8788IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8789{
8790 RT_NOREF(pFpuState);
8791 RTUINT128U uSrc1 = *puDst;
8792 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8793 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8794 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8795 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8796 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8797 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8798 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8799 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8800 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8801 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8802 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8803 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8804 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8805 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8806 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8807 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8808}
8809
8810#endif
8811
8812IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8813 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8814{
8815 RT_NOREF(pExtState);
8816 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8817 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8818 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8819 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8820 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8821 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8822 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8823 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8824 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8825 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8826 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8827 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8828 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8829 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8830 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8831 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8832}
8833
8834IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8835 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8836{
8837 RT_NOREF(pExtState);
8838 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8839 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8840 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8841 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8842 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8843 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8844 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8845 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8846 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8847 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8848 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8849 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8850 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8851 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8852 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8853 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8854 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8855 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8856 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8857 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8858 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8859 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8860 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8861 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8862 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8863 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8864 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8865 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8866 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8867 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8868 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8869 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8870}
8871
8872
8873/*
 * PSUBSB / VPSUBSB
8875 */
8876#ifdef IEM_WITHOUT_ASSEMBLY
8877
8878IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8879{
8880 RT_NOREF(pFpuState);
8881 RTUINT64U uSrc1 = { *puDst };
8882 RTUINT64U uSrc2 = { *puSrc };
8883 RTUINT64U uDst;
8884 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8885 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8886 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8887 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8888 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8889 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8890 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8891 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8892 *puDst = uDst.u;
8893}
8894
8895
8896IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8897{
8898 RT_NOREF(pFpuState);
8899 RTUINT128U uSrc1 = *puDst;
8900 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8901 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8902 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8903 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8904 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8905 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8906 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8907 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8908 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8909 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8910 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8911 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8912 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8913 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8914 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8915 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8916}
8917
8918#endif
8919
8920
8921/*
 * PSUBUSB / VPSUBUSB
8923 */
/**
 * Saturation helper for unsigned byte subtraction.
 *
 * The argument is reduced to 16 bits first; a negative difference therefore
 * becomes a value above 0xff and yields 0, while a difference in the range
 * [0, 0xff] is passed through unchanged.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    (   (uint16_t)(a_uWord) > (uint16_t)0xff \
     ? (uint8_t)0 \
     : (uint8_t)(a_uWord) )
8928
8929#ifdef IEM_WITHOUT_ASSEMBLY
8930
8931IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8932{
8933 RT_NOREF(pFpuState);
8934 RTUINT64U uSrc1 = { *puDst };
8935 RTUINT64U uSrc2 = { *puSrc };
8936 RTUINT64U uDst;
8937 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
8938 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
8939 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
8940 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
8941 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
8942 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
8943 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
8944 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
8945 *puDst = uDst.u;
8946}
8947
8948
8949IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8950{
8951 RT_NOREF(pFpuState);
8952 RTUINT128U uSrc1 = *puDst;
8953 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
8954 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
8955 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
8956 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
8957 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
8958 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
8959 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
8960 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
8961 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
8962 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
8963 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
8964 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
8965 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
8966 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
8967 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
8968 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
8969}
8970
8971#endif
8972
8973
8974/*
8975 * PSUBW / VPSUBW
8976 */
8977#ifdef IEM_WITHOUT_ASSEMBLY
8978
8979IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8980{
8981 RT_NOREF(pFpuState);
8982 RTUINT64U uSrc1 = { *puDst };
8983 RTUINT64U uSrc2 = { *puSrc };
8984 RTUINT64U uDst;
8985 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
8986 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
8987 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
8988 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
8989 *puDst = uDst.u;
8990}
8991
8992
8993IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8994{
8995 RT_NOREF(pFpuState);
8996 RTUINT128U uSrc1 = *puDst;
8997 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
8998 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
8999 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9000 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9001 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9002 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9003 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9004 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9005}
9006
9007#endif
9008
9009IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9010 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9011{
9012 RT_NOREF(pExtState);
9013 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9014 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9015 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9016 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9017 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9018 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9019 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9020 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9021}
9022
9023IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9024 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9025{
9026 RT_NOREF(pExtState);
9027 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9028 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9029 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9030 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9031 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9032 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9033 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9034 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9035 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9036 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9037 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9038 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9039 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9040 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9041 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9042 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9043}
9044
9045
9046/*
9047 * PSUBSW / VPSUBSW
9048 */
9049#ifdef IEM_WITHOUT_ASSEMBLY
9050
9051IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9052{
9053 RT_NOREF(pFpuState);
9054 RTUINT64U uSrc1 = { *puDst };
9055 RTUINT64U uSrc2 = { *puSrc };
9056 RTUINT64U uDst;
9057 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9058 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9059 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9060 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9061 *puDst = uDst.u;
9062}
9063
9064
9065IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9066{
9067 RT_NOREF(pFpuState);
9068 RTUINT128U uSrc1 = *puDst;
9069 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9070 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9071 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9072 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9073 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9074 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9075 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9076 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9077}
9078
9079#endif
9080
9081
9082/*
9083 * PSUBUSW / VPSUBUSW
9084 */
/**
 * Saturation helper for unsigned word subtraction.
 *
 * The argument is converted to 32-bit unsigned first; a negative difference
 * therefore becomes a value above 0xffff and yields 0, while a difference in
 * the range [0, 0xffff] is passed through unchanged.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    (   (uint32_t)(a_uDword) > (uint16_t)0xffff \
     ? (uint16_t)0 \
     : (uint16_t)(a_uDword) )
9089
9090#ifdef IEM_WITHOUT_ASSEMBLY
9091
9092IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9093{
9094 RT_NOREF(pFpuState);
9095 RTUINT64U uSrc1 = { *puDst };
9096 RTUINT64U uSrc2 = { *puSrc };
9097 RTUINT64U uDst;
9098 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9099 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9100 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9101 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9102 *puDst = uDst.u;
9103}
9104
9105
9106IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9107{
9108 RT_NOREF(pFpuState);
9109 RTUINT128U uSrc1 = *puDst;
9110 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9111 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9112 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9113 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9114 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9115 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9116 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9117 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9118}
9119
9120#endif
9121
9122
9123/*
9124 * PSUBD / VPSUBD.
9125 */
9126#ifdef IEM_WITHOUT_ASSEMBLY
9127
9128IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9129{
9130 RT_NOREF(pFpuState);
9131 RTUINT64U uSrc1 = { *puDst };
9132 RTUINT64U uSrc2 = { *puSrc };
9133 RTUINT64U uDst;
9134 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9135 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9136 *puDst = uDst.u;
9137}
9138
9139
9140IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9141{
9142 RT_NOREF(pFpuState);
9143 RTUINT128U uSrc1 = *puDst;
9144 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9145 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9146 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9147 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9148}
9149
9150#endif /* IEM_WITHOUT_ASSEMBLY */
9151
9152IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9153 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9154{
9155 RT_NOREF(pExtState);
9156 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9157 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9158 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9159 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9160}
9161
9162IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9163 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9164{
9165 RT_NOREF(pExtState);
9166 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9167 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9168 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9169 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9170 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9171 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9172 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9173 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9174}
9175
9176
9177/*
9178 * PSUBQ / VPSUBQ.
9179 */
9180#ifdef IEM_WITHOUT_ASSEMBLY
9181
9182IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9183{
9184 RT_NOREF(pFpuState);
9185 *puDst = *puDst - *puSrc;
9186}
9187
9188IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9189{
9190 RT_NOREF(pFpuState);
9191 RTUINT128U uSrc1 = *puDst;
9192 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9193 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9194}
9195
9196#endif
9197
9198IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9199 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9200{
9201 RT_NOREF(pExtState);
9202 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9203 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9204}
9205
9206IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9207 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9208{
9209 RT_NOREF(pExtState);
9210 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9211 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9212 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9213 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9214}
9215
9216
9217
9218/*
9219 * PMULLW / VPMULLW / PMULLD / VPMULLD
9220 */
9221#ifdef IEM_WITHOUT_ASSEMBLY
9222
9223IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9224{
9225 RT_NOREF(pFpuState);
9226 RTUINT64U uSrc1 = { *puDst };
9227 RTUINT64U uSrc2 = { *puSrc };
9228 RTUINT64U uDst;
9229 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9230 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9231 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9232 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9233 *puDst = uDst.u;
9234}
9235
9236
9237IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9238{
9239 RT_NOREF(pFpuState);
9240 RTUINT128U uSrc1 = *puDst;
9241 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9242 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9243 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9244 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9245 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9246 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9247 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9248 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9249}
9250
9251#endif
9252
9253IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9254{
9255 RTUINT128U uSrc1 = *puDst;
9256
9257 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9258 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9259 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9260 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9261 RT_NOREF(pFpuState);
9262}
9263
9264
9265IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9266{
9267 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9268 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9269 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9270 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9271 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9272 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9273 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9274 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9275}
9276
9277
9278IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9279{
9280 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9281 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9282 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9283 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9284 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9285 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9286 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9287 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9288 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9289 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9290 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9291 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9292 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9293 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9294 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9295 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9296}
9297
9298
9299IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9300{
9301 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9302 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9303 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9304 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9305}
9306
9307
9308IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9309{
9310 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9311 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9312 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9313 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9314 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9315 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9316 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9317 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9318}
9319
9320
9321/*
9322 * PMULHW / VPMULHW
9323 */
9324#ifdef IEM_WITHOUT_ASSEMBLY
9325
9326IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9327{
9328 RT_NOREF(pFpuState);
9329 RTUINT64U uSrc1 = { *puDst };
9330 RTUINT64U uSrc2 = { *puSrc };
9331 RTUINT64U uDst;
9332 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9333 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9334 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9335 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9336 *puDst = uDst.u;
9337}
9338
9339
9340IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9341{
9342 RT_NOREF(pFpuState);
9343 RTUINT128U uSrc1 = *puDst;
9344 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9345 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9346 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9347 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9348 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9349 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9350 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9351 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9352}
9353
9354#endif
9355
9356IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9357{
9358 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9359 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9360 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9361 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9362 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9363 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9364 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9365 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9366}
9367
9368
9369IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9370{
9371 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9372 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9373 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9374 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9375 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9376 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9377 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9378 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9379 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9380 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9381 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9382 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9383 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9384 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9385 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9386 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9387}
9388
9389
9390/*
9391 * PMULHUW / VPMULHUW
9392 */
9393#ifdef IEM_WITHOUT_ASSEMBLY
9394
9395IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9396{
9397 RTUINT64U uSrc1 = { *puDst };
9398 RTUINT64U uSrc2 = { *puSrc };
9399 RTUINT64U uDst;
9400 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9401 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9402 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9403 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9404 *puDst = uDst.u;
9405}
9406
9407
9408IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9409{
9410 RTUINT128U uSrc1 = *puDst;
9411 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9412 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9413 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9414 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9415 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9416 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9417 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9418 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9419}
9420
9421#endif
9422
9423IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9424{
9425 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9426 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9427 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9428 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9429 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9430 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9431 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9432 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9433}
9434
9435
9436IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9437{
9438 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9439 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9440 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9441 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9442 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9443 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9444 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9445 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9446 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9447 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9448 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9449 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9450 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9451 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9452 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9453 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9454}
9455
9456
9457/*
9458 * PSRLW / VPSRLW
9459 */
9460#ifdef IEM_WITHOUT_ASSEMBLY
9461
9462IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9463{
9464 RTUINT64U uSrc1 = { *puDst };
9465 RTUINT64U uSrc2 = { *puSrc };
9466 RTUINT64U uDst;
9467
9468 if (uSrc2.au64[0] <= 15)
9469 {
9470 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9471 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9472 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9473 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9474 }
9475 else
9476 {
9477 uDst.au64[0] = 0;
9478 }
9479 *puDst = uDst.u;
9480}
9481
9482
9483IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9484{
9485 RTUINT64U uSrc1 = { *puDst };
9486 RTUINT64U uDst;
9487
9488 if (uShift <= 15)
9489 {
9490 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9491 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9492 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9493 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9494 }
9495 else
9496 {
9497 uDst.au64[0] = 0;
9498 }
9499 *puDst = uDst.u;
9500}
9501
9502
9503IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9504{
9505 RTUINT128U uSrc1 = *puDst;
9506
9507 if (puSrc->au64[0] <= 15)
9508 {
9509 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9510 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9511 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9512 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9513 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9514 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9515 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9516 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9517 }
9518 else
9519 {
9520 puDst->au64[0] = 0;
9521 puDst->au64[1] = 0;
9522 }
9523}
9524
9525IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9526{
9527 RTUINT128U uSrc1 = *puDst;
9528
9529 if (uShift <= 15)
9530 {
9531 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9532 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9533 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9534 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9535 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9536 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9537 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9538 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9539 }
9540 else
9541 {
9542 puDst->au64[0] = 0;
9543 puDst->au64[1] = 0;
9544 }
9545}
9546
9547#endif
9548
9549
9550/*
9551 * PSRAW / VPSRAW
9552 */
9553#ifdef IEM_WITHOUT_ASSEMBLY
9554
9555IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9556{
9557 RTUINT64U uSrc1 = { *puDst };
9558 RTUINT64U uSrc2 = { *puSrc };
9559 RTUINT64U uDst;
9560
9561 if (uSrc2.au64[0] <= 15)
9562 {
9563 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
9564 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
9565 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
9566 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
9567 }
9568 else
9569 {
9570 uDst.au64[0] = 0;
9571 }
9572 *puDst = uDst.u;
9573}
9574
9575
9576IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9577{
9578 RTUINT64U uSrc1 = { *puDst };
9579 RTUINT64U uDst;
9580
9581 if (uShift <= 15)
9582 {
9583 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
9584 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
9585 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
9586 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
9587 }
9588 else
9589 {
9590 uDst.au64[0] = 0;
9591 }
9592 *puDst = uDst.u;
9593}
9594
9595
9596IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9597{
9598 RTUINT128U uSrc1 = *puDst;
9599
9600 if (puSrc->au64[0] <= 15)
9601 {
9602 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
9603 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
9604 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
9605 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
9606 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
9607 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
9608 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
9609 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
9610 }
9611 else
9612 {
9613 puDst->au64[0] = 0;
9614 puDst->au64[1] = 0;
9615 }
9616}
9617
9618IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9619{
9620 RTUINT128U uSrc1 = *puDst;
9621
9622 if (uShift <= 15)
9623 {
9624 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
9625 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
9626 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
9627 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
9628 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
9629 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
9630 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
9631 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
9632 }
9633 else
9634 {
9635 puDst->au64[0] = 0;
9636 puDst->au64[1] = 0;
9637 }
9638}
9639
9640#endif
9641
9642
9643/*
9644 * PSLLW / VPSLLW
9645 */
9646#ifdef IEM_WITHOUT_ASSEMBLY
9647
9648IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9649{
9650 RTUINT64U uSrc1 = { *puDst };
9651 RTUINT64U uSrc2 = { *puSrc };
9652 RTUINT64U uDst;
9653
9654 if (uSrc2.au64[0] <= 15)
9655 {
9656 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
9657 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
9658 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
9659 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
9660 }
9661 else
9662 {
9663 uDst.au64[0] = 0;
9664 }
9665 *puDst = uDst.u;
9666}
9667
9668
9669IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9670{
9671 RTUINT64U uSrc1 = { *puDst };
9672 RTUINT64U uDst;
9673
9674 if (uShift <= 15)
9675 {
9676 uDst.au16[0] = uSrc1.au16[0] << uShift;
9677 uDst.au16[1] = uSrc1.au16[1] << uShift;
9678 uDst.au16[2] = uSrc1.au16[2] << uShift;
9679 uDst.au16[3] = uSrc1.au16[3] << uShift;
9680 }
9681 else
9682 {
9683 uDst.au64[0] = 0;
9684 }
9685 *puDst = uDst.u;
9686}
9687
9688
9689IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9690{
9691 RTUINT128U uSrc1 = *puDst;
9692
9693 if (puSrc->au64[0] <= 15)
9694 {
9695 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
9696 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
9697 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
9698 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
9699 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
9700 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
9701 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
9702 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
9703 }
9704 else
9705 {
9706 puDst->au64[0] = 0;
9707 puDst->au64[1] = 0;
9708 }
9709}
9710
9711IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9712{
9713 RTUINT128U uSrc1 = *puDst;
9714
9715 if (uShift <= 15)
9716 {
9717 puDst->au16[0] = uSrc1.au16[0] << uShift;
9718 puDst->au16[1] = uSrc1.au16[1] << uShift;
9719 puDst->au16[2] = uSrc1.au16[2] << uShift;
9720 puDst->au16[3] = uSrc1.au16[3] << uShift;
9721 puDst->au16[4] = uSrc1.au16[4] << uShift;
9722 puDst->au16[5] = uSrc1.au16[5] << uShift;
9723 puDst->au16[6] = uSrc1.au16[6] << uShift;
9724 puDst->au16[7] = uSrc1.au16[7] << uShift;
9725 }
9726 else
9727 {
9728 puDst->au64[0] = 0;
9729 puDst->au64[1] = 0;
9730 }
9731}
9732
9733#endif
9734
9735
9736/*
9737 * PSRLD / VPSRLD
9738 */
9739#ifdef IEM_WITHOUT_ASSEMBLY
9740
9741IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9742{
9743 RTUINT64U uSrc1 = { *puDst };
9744 RTUINT64U uSrc2 = { *puSrc };
9745 RTUINT64U uDst;
9746
9747 if (uSrc2.au64[0] <= 31)
9748 {
9749 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9750 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9751 }
9752 else
9753 {
9754 uDst.au64[0] = 0;
9755 }
9756 *puDst = uDst.u;
9757}
9758
9759
9760IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9761{
9762 RTUINT64U uSrc1 = { *puDst };
9763 RTUINT64U uDst;
9764
9765 if (uShift <= 31)
9766 {
9767 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9768 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9769 }
9770 else
9771 {
9772 uDst.au64[0] = 0;
9773 }
9774 *puDst = uDst.u;
9775}
9776
9777
9778IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9779{
9780 RTUINT128U uSrc1 = *puDst;
9781
9782 if (puSrc->au64[0] <= 31)
9783 {
9784 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9785 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9786 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9787 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9788 }
9789 else
9790 {
9791 puDst->au64[0] = 0;
9792 puDst->au64[1] = 0;
9793 }
9794}
9795
9796IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9797{
9798 RTUINT128U uSrc1 = *puDst;
9799
9800 if (uShift <= 31)
9801 {
9802 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9803 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9804 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9805 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9806 }
9807 else
9808 {
9809 puDst->au64[0] = 0;
9810 puDst->au64[1] = 0;
9811 }
9812}
9813
9814#endif
9815
9816
9817/*
9818 * PSRAD / VPSRAD
9819 */
9820#ifdef IEM_WITHOUT_ASSEMBLY
9821
9822IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9823{
9824 RTUINT64U uSrc1 = { *puDst };
9825 RTUINT64U uSrc2 = { *puSrc };
9826 RTUINT64U uDst;
9827
9828 if (uSrc2.au64[0] <= 31)
9829 {
9830 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9831 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9832 }
9833 else
9834 {
9835 uDst.au64[0] = 0;
9836 }
9837 *puDst = uDst.u;
9838}
9839
9840
9841IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
9842{
9843 RTUINT64U uSrc1 = { *puDst };
9844 RTUINT64U uDst;
9845
9846 if (uShift <= 31)
9847 {
9848 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
9849 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
9850 }
9851 else
9852 {
9853 uDst.au64[0] = 0;
9854 }
9855 *puDst = uDst.u;
9856}
9857
9858
9859IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9860{
9861 RTUINT128U uSrc1 = *puDst;
9862
9863 if (puSrc->au64[0] <= 31)
9864 {
9865 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
9866 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
9867 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
9868 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
9869 }
9870 else
9871 {
9872 puDst->au64[0] = 0;
9873 puDst->au64[1] = 0;
9874 }
9875}
9876
9877IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9878{
9879 RTUINT128U uSrc1 = *puDst;
9880
9881 if (uShift <= 31)
9882 {
9883 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
9884 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
9885 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
9886 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
9887 }
9888 else
9889 {
9890 puDst->au64[0] = 0;
9891 puDst->au64[1] = 0;
9892 }
9893}
9894
9895#endif
9896
9897
9898/*
9899 * PSLLD / VPSLLD
9900 */
9901#ifdef IEM_WITHOUT_ASSEMBLY
9902
9903IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9904{
9905 RTUINT64U uSrc1 = { *puDst };
9906 RTUINT64U uSrc2 = { *puSrc };
9907 RTUINT64U uDst;
9908
9909 if (uSrc2.au64[0] <= 31)
9910 {
9911 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
9912 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
9913 }
9914 else
9915 {
9916 uDst.au64[0] = 0;
9917 }
9918 *puDst = uDst.u;
9919}
9920
9921
9922IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9923{
9924 RTUINT64U uSrc1 = { *puDst };
9925 RTUINT64U uDst;
9926
9927 if (uShift <= 31)
9928 {
9929 uDst.au32[0] = uSrc1.au32[0] << uShift;
9930 uDst.au32[1] = uSrc1.au32[1] << uShift;
9931 }
9932 else
9933 {
9934 uDst.au64[0] = 0;
9935 }
9936 *puDst = uDst.u;
9937}
9938
9939
9940IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9941{
9942 RTUINT128U uSrc1 = *puDst;
9943
9944 if (puSrc->au64[0] <= 31)
9945 {
9946 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
9947 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
9948 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
9949 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
9950 }
9951 else
9952 {
9953 puDst->au64[0] = 0;
9954 puDst->au64[1] = 0;
9955 }
9956}
9957
9958IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9959{
9960 RTUINT128U uSrc1 = *puDst;
9961
9962 if (uShift <= 31)
9963 {
9964 puDst->au32[0] = uSrc1.au32[0] << uShift;
9965 puDst->au32[1] = uSrc1.au32[1] << uShift;
9966 puDst->au32[2] = uSrc1.au32[2] << uShift;
9967 puDst->au32[3] = uSrc1.au32[3] << uShift;
9968 }
9969 else
9970 {
9971 puDst->au64[0] = 0;
9972 puDst->au64[1] = 0;
9973 }
9974}
9975
9976#endif
9977
9978
9979/*
9980 * PSRLQ / VPSRLQ
9981 */
9982#ifdef IEM_WITHOUT_ASSEMBLY
9983
9984IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9985{
9986 RTUINT64U uSrc1 = { *puDst };
9987 RTUINT64U uSrc2 = { *puSrc };
9988 RTUINT64U uDst;
9989
9990 if (uSrc2.au64[0] <= 63)
9991 {
9992 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
9993 }
9994 else
9995 {
9996 uDst.au64[0] = 0;
9997 }
9998 *puDst = uDst.u;
9999}
10000
10001
10002IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10003{
10004 RTUINT64U uSrc1 = { *puDst };
10005 RTUINT64U uDst;
10006
10007 if (uShift <= 63)
10008 {
10009 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10010 }
10011 else
10012 {
10013 uDst.au64[0] = 0;
10014 }
10015 *puDst = uDst.u;
10016}
10017
10018
10019IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10020{
10021 RTUINT128U uSrc1 = *puDst;
10022
10023 if (puSrc->au64[0] <= 63)
10024 {
10025 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10026 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10027 }
10028 else
10029 {
10030 puDst->au64[0] = 0;
10031 puDst->au64[1] = 0;
10032 }
10033}
10034
10035IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10036{
10037 RTUINT128U uSrc1 = *puDst;
10038
10039 if (uShift <= 63)
10040 {
10041 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10042 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10043 }
10044 else
10045 {
10046 puDst->au64[0] = 0;
10047 puDst->au64[1] = 0;
10048 }
10049}
10050
10051#endif
10052
10053
10054/*
10055 * PSLLQ / VPSLLQ
10056 */
10057#ifdef IEM_WITHOUT_ASSEMBLY
10058
10059IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10060{
10061 RTUINT64U uSrc1 = { *puDst };
10062 RTUINT64U uSrc2 = { *puSrc };
10063 RTUINT64U uDst;
10064
10065 if (uSrc2.au64[0] <= 63)
10066 {
10067 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10068 }
10069 else
10070 {
10071 uDst.au64[0] = 0;
10072 }
10073 *puDst = uDst.u;
10074}
10075
10076
10077IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10078{
10079 RTUINT64U uSrc1 = { *puDst };
10080 RTUINT64U uDst;
10081
10082 if (uShift <= 63)
10083 {
10084 uDst.au64[0] = uSrc1.au64[0] << uShift;
10085 }
10086 else
10087 {
10088 uDst.au64[0] = 0;
10089 }
10090 *puDst = uDst.u;
10091}
10092
10093
10094IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10095{
10096 RTUINT128U uSrc1 = *puDst;
10097
10098 if (puSrc->au64[0] <= 63)
10099 {
10100 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10101 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10102 }
10103 else
10104 {
10105 puDst->au64[0] = 0;
10106 puDst->au64[1] = 0;
10107 }
10108}
10109
10110IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10111{
10112 RTUINT128U uSrc1 = *puDst;
10113
10114 if (uShift <= 63)
10115 {
10116 puDst->au64[0] = uSrc1.au64[0] << uShift;
10117 puDst->au64[1] = uSrc1.au64[1] << uShift;
10118 }
10119 else
10120 {
10121 puDst->au64[0] = 0;
10122 puDst->au64[1] = 0;
10123 }
10124}
10125
10126#endif
10127
10128
10129/*
10130 * PSRLDQ / VPSRLDQ
10131 */
10132#ifdef IEM_WITHOUT_ASSEMBLY
10133
10134IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10135{
10136 RTUINT128U uSrc1 = *puDst;
10137
10138 if (uShift < 16)
10139 {
10140 int i;
10141
10142 for (i = 0; i < 16 - uShift; ++i)
10143 puDst->au8[i] = uSrc1.au8[i + uShift];
10144 for (i = 16 - uShift; i < 16; ++i)
10145 puDst->au8[i] = 0;
10146 }
10147 else
10148 {
10149 puDst->au64[0] = 0;
10150 puDst->au64[1] = 0;
10151 }
10152}
10153
10154#endif
10155
10156
10157/*
10158 * PSLLDQ / VPSLLDQ
10159 */
10160#ifdef IEM_WITHOUT_ASSEMBLY
10161
10162IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10163{
10164 RTUINT128U uSrc1 = *puDst;
10165
10166 if (uShift < 16)
10167 {
10168 int i;
10169
10170 for (i = 0; i < uShift; ++i)
10171 puDst->au8[i] = 0;
10172 for (i = uShift; i < 16; ++i)
10173 puDst->au8[i] = uSrc1.au8[i - uShift];
10174 }
10175 else
10176 {
10177 puDst->au64[0] = 0;
10178 puDst->au64[1] = 0;
10179 }
10180}
10181
10182#endif
10183
10184
10185/*
10186 * PMADDWD / VPMADDWD
10187 */
10188#ifdef IEM_WITHOUT_ASSEMBLY
10189
10190IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10191{
10192 RTUINT64U uSrc1 = { *puDst };
10193 RTUINT64U uSrc2 = { *puSrc };
10194 RTUINT64U uDst;
10195
10196 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10197 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10198 *puDst = uDst.u;
10199 RT_NOREF(pFpuState);
10200}
10201
10202
10203IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10204{
10205 RTUINT128U uSrc1 = *puDst;
10206
10207 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10208 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10209 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10210 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10211 RT_NOREF(pFpuState);
10212}
10213
10214#endif
10215
10216
10217/*
10218 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10219 */
10220#ifdef IEM_WITHOUT_ASSEMBLY
10221
10222IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10223{
10224 RTUINT64U uSrc1 = { *puDst };
10225 RTUINT64U uSrc2 = { *puSrc };
10226 RTUINT64U uDst;
10227
10228 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
10229 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
10230 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
10231 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
10232 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
10233 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
10234 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
10235 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
10236 *puDst = uDst.u;
10237 RT_NOREF(pFpuState);
10238}
10239
10240
10241IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10242{
10243 RTUINT128U uSrc1 = *puDst;
10244
10245 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
10246 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
10247 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
10248 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
10249 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
10250 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
10251 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
10252 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
10253 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
10254 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
10255 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
10256 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
10257 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
10258 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
10259 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
10260 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
10261 RT_NOREF(pFpuState);
10262}
10263
10264#endif
10265
10266
10267IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10268{
10269 RTUINT128U uSrc1 = *puDst;
10270
10271 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
10272 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
10273 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
10274 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
10275 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
10276 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
10277 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
10278 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
10279 RT_NOREF(pFpuState);
10280}
10281
10282
10283IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10284{
10285 RTUINT128U uSrc1 = *puDst;
10286
10287 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
10288 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
10289 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
10290 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
10291 RT_NOREF(pFpuState);
10292}
10293
10294
10295IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10296 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10297{
10298 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10299 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10300 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10301 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10302 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10303 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10304 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10305 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10306 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10307 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10308 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10309 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10310 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10311 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10312 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10313 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10314 RT_NOREF(pExtState);
10315}
10316
10317
10318IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10319 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10320{
10321 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10322 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10323 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10324 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10325 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10326 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10327 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10328 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10329 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10330 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10331 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10332 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10333 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10334 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10335 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10336 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10337 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
10338 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
10339 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
10340 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
10341 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
10342 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
10343 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
10344 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
10345 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
10346 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
10347 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
10348 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
10349 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
10350 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
10351 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
10352 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
10353 RT_NOREF(pExtState);
10354}
10355
10356
10357IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10358 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10359{
10360 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10361 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10362 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10363 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10364 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10365 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10366 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10367 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10368 RT_NOREF(pExtState);
10369}
10370
10371
10372IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10373 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10374{
10375 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10376 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10377 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10378 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10379 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10380 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10381 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10382 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10383 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10384 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10385 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
10386 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
10387 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
10388 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
10389 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
10390 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
10391 RT_NOREF(pExtState);
10392}
10393
10394
10395IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10396 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10397{
10398 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10399 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10400 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10401 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10402 RT_NOREF(pExtState);
10403}
10404
10405
10406IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10407 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10408{
10409 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10410 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10411 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10412 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10413 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10414 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10415 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10416 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10417 RT_NOREF(pExtState);
10418}
10419
10420
10421/*
10422 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
10423 */
10424#ifdef IEM_WITHOUT_ASSEMBLY
10425
10426IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10427{
10428 RTUINT64U uSrc1 = { *puDst };
10429 RTUINT64U uSrc2 = { *puSrc };
10430 RTUINT64U uDst;
10431
10432 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
10433 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
10434 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
10435 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
10436 *puDst = uDst.u;
10437 RT_NOREF(pFpuState);
10438}
10439
10440
10441IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10442{
10443 RTUINT128U uSrc1 = *puDst;
10444
10445 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10446 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10447 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10448 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10449 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10450 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10451 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10452 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10453 RT_NOREF(pFpuState);
10454}
10455
10456#endif
10457
10458IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10459{
10460 RTUINT128U uSrc1 = *puDst;
10461
10462 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10463 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10464 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10465 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10466 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10467 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10468 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10469 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10470 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10471 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10472 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
10473 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
10474 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
10475 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
10476 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
10477 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
10478 RT_NOREF(pFpuState);
10479}
10480
10481
10482IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10483{
10484 RTUINT128U uSrc1 = *puDst;
10485
10486 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10487 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10488 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10489 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10490 RT_NOREF(pFpuState);
10491}
10492
10493
10494IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10495 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10496{
10497 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10498 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10499 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10500 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10501 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10502 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10503 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10504 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10505 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10506 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10507 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10508 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10509 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10510 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10511 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10512 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10513 RT_NOREF(pExtState);
10514}
10515
10516
10517IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10518 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10519{
10520 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10521 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10522 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10523 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10524 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10525 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10526 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10527 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10528 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10529 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10530 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10531 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10532 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10533 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10534 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10535 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10536 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
10537 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
10538 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
10539 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
10540 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
10541 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10542 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10543 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10544 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10545 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10546 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10547 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10548 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10549 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10550 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10551 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10552 RT_NOREF(pExtState);
10553}
10554
10555
10556IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10557 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10558{
10559 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10560 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10561 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10562 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10563 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10564 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10565 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10566 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10567 RT_NOREF(pExtState);
10568}
10569
10570
10571IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10572 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10573{
10574 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10575 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10576 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10577 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10578 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10579 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10580 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10581 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10582 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10583 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10584 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
10585 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
10586 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
10587 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
10588 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
10589 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
10590 RT_NOREF(pExtState);
10591}
10592
10593
10594IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10595 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10596{
10597 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10598 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10599 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10600 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10601 RT_NOREF(pExtState);
10602}
10603
10604
10605IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10606 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10607{
10608 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10609 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10610 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10611 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10612 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10613 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10614 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10615 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10616 RT_NOREF(pExtState);
10617}
10618
10619
10620/*
10621 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
10622 */
10623#ifdef IEM_WITHOUT_ASSEMBLY
10624
10625IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10626{
10627 RTUINT64U uSrc1 = { *puDst };
10628 RTUINT64U uSrc2 = { *puSrc };
10629 RTUINT64U uDst;
10630
10631 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
10632 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
10633 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
10634 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
10635 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
10636 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
10637 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
10638 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
10639 *puDst = uDst.u;
10640 RT_NOREF(pFpuState);
10641}
10642
10643
10644IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10645{
10646 RTUINT128U uSrc1 = *puDst;
10647
10648 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
10649 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
10650 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
10651 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
10652 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
10653 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
10654 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
10655 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
10656 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
10657 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
10658 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
10659 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
10660 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
10661 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
10662 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
10663 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
10664 RT_NOREF(pFpuState);
10665}
10666
10667#endif
10668
10669IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10670{
10671 RTUINT128U uSrc1 = *puDst;
10672
10673 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
10674 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
10675 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
10676 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
10677 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
10678 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
10679 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
10680 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
10681 RT_NOREF(pFpuState);
10682}
10683
10684
10685IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10686{
10687 RTUINT128U uSrc1 = *puDst;
10688
10689 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
10690 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
10691 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
10692 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
10693 RT_NOREF(pFpuState);
10694}
10695
10696
10697IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10698 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10699{
10700 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10701 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10702 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10703 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10704 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10705 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10706 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10707 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10708 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10709 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10710 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10711 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10712 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10713 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10714 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10715 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10716 RT_NOREF(pExtState);
10717}
10718
10719
10720IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10721 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10722{
10723 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10724 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10725 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10726 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10727 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10728 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10729 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10730 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10731 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10732 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10733 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10734 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10735 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10736 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10737 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10738 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10739 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
10740 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
10741 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
10742 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
10743 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
10744 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
10745 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
10746 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
10747 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
10748 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
10749 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
10750 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
10751 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
10752 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
10753 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
10754 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
10755 RT_NOREF(pExtState);
10756}
10757
10758
10759IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10760 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10761{
10762 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10763 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10764 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10765 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10766 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10767 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10768 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10769 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10770 RT_NOREF(pExtState);
10771}
10772
10773
10774IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10775 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10776{
10777 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10778 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10779 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10780 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10781 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10782 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10783 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10784 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10785 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10786 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10787 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
10788 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
10789 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
10790 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
10791 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
10792 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
10793 RT_NOREF(pExtState);
10794}
10795
10796
10797IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10798 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10799{
10800 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10801 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10802 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10803 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10804 RT_NOREF(pExtState);
10805}
10806
10807
10808IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10809 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10810{
10811 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10812 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10813 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10814 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10815 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10816 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10817 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10818 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10819 RT_NOREF(pExtState);
10820}
10821
10822
10823/*
10824 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
10825 */
10826#ifdef IEM_WITHOUT_ASSEMBLY
10827
10828IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10829{
10830 RTUINT64U uSrc1 = { *puDst };
10831 RTUINT64U uSrc2 = { *puSrc };
10832 RTUINT64U uDst;
10833
10834 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
10835 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
10836 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
10837 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
10838 *puDst = uDst.u;
10839 RT_NOREF(pFpuState);
10840}
10841
10842
10843IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10844{
10845 RTUINT128U uSrc1 = *puDst;
10846
10847 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10848 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10849 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10850 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10851 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10852 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10853 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10854 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10855 RT_NOREF(pFpuState);
10856}
10857
10858#endif
10859
10860IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10861{
10862 RTUINT128U uSrc1 = *puDst;
10863
10864 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10865 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10866 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10867 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10868 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10869 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10870 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10871 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10872 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10873 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10874 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
10875 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
10876 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
10877 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
10878 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
10879 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
10880 RT_NOREF(pFpuState);
10881}
10882
10883
10884IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10885{
10886 RTUINT128U uSrc1 = *puDst;
10887
10888 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10889 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10890 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10891 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10892 RT_NOREF(pFpuState);
10893}
10894
10895
10896IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10897 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10898{
10899 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10900 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10901 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10902 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10903 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10904 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10905 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10906 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10907 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10908 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10909 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10910 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10911 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10912 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10913 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10914 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10915 RT_NOREF(pExtState);
10916}
10917
10918
10919IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10920 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10921{
10922 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10923 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10924 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10925 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10926 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10927 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10928 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10929 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10930 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10931 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10932 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10933 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10934 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10935 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10936 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10937 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10938 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
10939 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
10940 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
10941 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
10942 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
10943 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
10944 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
10945 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
10946 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
10947 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
10948 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
10949 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
10950 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
10951 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
10952 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
10953 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
10954 RT_NOREF(pExtState);
10955}
10956
10957
10958IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10959 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10960{
10961 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10962 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10963 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10964 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10965 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10966 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10967 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10968 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10969 RT_NOREF(pExtState);
10970}
10971
10972
10973IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10974 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10975{
10976 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10977 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10978 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10979 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10980 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10981 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10982 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10983 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10984 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10985 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10986 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
10987 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
10988 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
10989 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
10990 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
10991 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
10992 RT_NOREF(pExtState);
10993}
10994
10995
10996IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10997 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10998{
10999 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11000 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11001 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11002 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11003 RT_NOREF(pExtState);
11004}
11005
11006
11007IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11008 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11009{
11010 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11011 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11012 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11013 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11014 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11015 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11016 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11017 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11018 RT_NOREF(pExtState);
11019}
11020
11021
11022/*
11023 * PAVGB / VPAVGB / PAVGW / VPAVGW
11024 */
/** Rounded average of two byte values: (a + b + 1) >> 1; the first operand is
 *  cast to 16 bits before the addition and the result is cast back to 8 bits. */
#define PAVGB_EXEC(a_Src1, a_Src2) ((uint8_t)(((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1))
/** Rounded average of two word values: (a + b + 1) >> 1; the first operand is
 *  cast to 32 bits before the addition and the result is cast back to 16 bits. */
#define PAVGW_EXEC(a_Src1, a_Src2) ((uint16_t)(((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1))
11027
11028#ifdef IEM_WITHOUT_ASSEMBLY
11029
11030IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11031{
11032 RTUINT64U uSrc1 = { *puDst };
11033 RTUINT64U uSrc2 = { *puSrc };
11034 RTUINT64U uDst;
11035
11036 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
11037 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
11038 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
11039 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
11040 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
11041 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
11042 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
11043 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
11044 *puDst = uDst.u;
11045}
11046
11047
11048IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11049{
11050 RTUINT128U uSrc1 = *puDst;
11051
11052 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11053 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11054 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11055 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11056 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11057 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11058 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11059 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11060 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11061 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11062 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11063 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11064 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11065 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11066 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11067 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11068}
11069
11070
11071IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11072{
11073 RTUINT64U uSrc1 = { *puDst };
11074 RTUINT64U uSrc2 = { *puSrc };
11075 RTUINT64U uDst;
11076
11077 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
11078 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
11079 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
11080 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
11081 *puDst = uDst.u;
11082}
11083
11084
11085IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11086{
11087 RTUINT128U uSrc1 = *puDst;
11088
11089 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
11090 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
11091 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
11092 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
11093 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
11094 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
11095 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
11096 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
11097}
11098
11099#endif
11100
11101IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11102{
11103 RTUINT128U uSrc1 = *puDst;
11104
11105 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11106 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11107 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11108 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11109 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11110 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11111 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11112 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11113 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11114 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11115 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11116 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11117 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11118 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11119 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11120 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11121}
11122
11123
11124IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11125{
11126 RTUINT128U uSrc1 = *puDst;
11127
11128 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11129 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11130 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11131 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11132 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11133 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11134 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11135 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11136 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11137 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11138 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11139 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11140 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11141 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11142 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11143 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11144}
11145
11146
11147IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11148{
11149 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11150 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11151 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11152 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11153 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11154 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11155 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11156 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11157 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11158 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11159 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11160 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11161 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11162 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11163 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11164 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11165}
11166
11167
11168IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11169{
11170 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11171 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11172 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11173 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11174 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11175 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11176 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11177 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11178 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11179 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11180 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11181 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11182 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11183 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11184 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11185 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11186 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11187 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11188 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11189 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11190 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11191 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11192 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11193 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11194 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11195 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11196 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11197 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11198 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11199 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11200 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11201 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11202}
11203
11204
11205IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11206{
11207 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11208 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11209 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11210 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11211 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11212 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11213 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11214 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11215}
11216
11217
11218IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11219{
11220 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11221 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11222 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11223 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11224 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11225 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11226 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11227 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11228 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11229 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11230 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
11231 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
11232 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
11233 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
11234 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
11235 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
11236}
11237
11238#undef PAVGB_EXEC
11239#undef PAVGW_EXEC
11240
11241
11242/*
11243 * PMOVMSKB / VPMOVMSKB
11244 */
11245#ifdef IEM_WITHOUT_ASSEMBLY
11246
11247IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
11248{
11249 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11250 uint64_t const uSrc = *pu64Src;
11251 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
11252 | ((uSrc >> (15-1)) & RT_BIT_64(1))
11253 | ((uSrc >> (23-2)) & RT_BIT_64(2))
11254 | ((uSrc >> (31-3)) & RT_BIT_64(3))
11255 | ((uSrc >> (39-4)) & RT_BIT_64(4))
11256 | ((uSrc >> (47-5)) & RT_BIT_64(5))
11257 | ((uSrc >> (55-6)) & RT_BIT_64(6))
11258 | ((uSrc >> (63-7)) & RT_BIT_64(7));
11259}
11260
11261
11262IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
11263{
11264 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11265 uint64_t const uSrc0 = pu128Src->QWords.qw0;
11266 uint64_t const uSrc1 = pu128Src->QWords.qw1;
11267 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11268 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11269 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11270 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11271 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11272 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11273 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11274 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11275 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11276 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11277 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11278 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11279 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11280 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11281 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11282 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
11283}
11284
11285#endif
11286
11287IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
11288{
11289 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11290 uint64_t const uSrc0 = puSrc->QWords.qw0;
11291 uint64_t const uSrc1 = puSrc->QWords.qw1;
11292 uint64_t const uSrc2 = puSrc->QWords.qw2;
11293 uint64_t const uSrc3 = puSrc->QWords.qw3;
11294 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11295 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11296 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11297 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11298 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11299 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11300 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11301 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11302 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11303 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11304 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11305 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11306 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11307 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11308 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11309 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
11310 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
11311 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
11312 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
11313 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
11314 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
11315 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
11316 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
11317 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
11318 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
11319 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
11320 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
11321 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
11322 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
11323 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
11324 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
11325 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
11326}
11327
11328
11329/*
11330 * [V]PSHUFB
11331 */
11332
11333IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11334{
11335 RTUINT64U const uSrc = { *puSrc };
11336 RTUINT64U const uDstIn = { *puDst };
11337 ASMCompilerBarrier();
11338 RTUINT64U uDstOut = { 0 };
11339 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
11340 {
11341 uint8_t idxSrc = uSrc.au8[iByte];
11342 if (!(idxSrc & 0x80))
11343 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
11344 }
11345 *puDst = uDstOut.u;
11346 RT_NOREF(pFpuState);
11347}
11348
11349
11350IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11351{
11352 RTUINT128U const uSrc = *puSrc;
11353 RTUINT128U const uDstIn = *puDst;
11354 ASMCompilerBarrier();
11355 puDst->au64[0] = 0;
11356 puDst->au64[1] = 0;
11357 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11358 {
11359 uint8_t idxSrc = uSrc.au8[iByte];
11360 if (!(idxSrc & 0x80))
11361 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
11362 }
11363 RT_NOREF(pFpuState);
11364}
11365
11366
11367IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11368 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11369{
11370 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
11371 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
11372 ASMCompilerBarrier();
11373 puDst->au64[0] = 0;
11374 puDst->au64[1] = 0;
11375 for (unsigned iByte = 0; iByte < 16; iByte++)
11376 {
11377 uint8_t idxSrc = uSrc2.au8[iByte];
11378 if (!(idxSrc & 0x80))
11379 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11380 }
11381 RT_NOREF(pExtState);
11382}
11383
11384
11385IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11386 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11387{
11388 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
11389 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
11390 ASMCompilerBarrier();
11391 puDst->au64[0] = 0;
11392 puDst->au64[1] = 0;
11393 puDst->au64[2] = 0;
11394 puDst->au64[3] = 0;
11395 for (unsigned iByte = 0; iByte < 16; iByte++)
11396 {
11397 uint8_t idxSrc = uSrc2.au8[iByte];
11398 if (!(idxSrc & 0x80))
11399 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11400 }
11401 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11402 {
11403 uint8_t idxSrc = uSrc2.au8[iByte];
11404 if (!(idxSrc & 0x80))
11405 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
11406 }
11407 RT_NOREF(pExtState);
11408}
11409
11410
11411/*
11412 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
11413 */
11414#ifdef IEM_WITHOUT_ASSEMBLY
11415
11416IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
11417{
11418 uint64_t const uSrc = *puSrc;
11419 ASMCompilerBarrier();
11420 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11421 uSrc >> (((bEvil >> 2) & 3) * 16),
11422 uSrc >> (((bEvil >> 4) & 3) * 16),
11423 uSrc >> (((bEvil >> 6) & 3) * 16));
11424}
11425
11426
11427IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11428{
11429 puDst->QWords.qw0 = puSrc->QWords.qw0;
11430 uint64_t const uSrc = puSrc->QWords.qw1;
11431 ASMCompilerBarrier();
11432 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11433 uSrc >> (((bEvil >> 2) & 3) * 16),
11434 uSrc >> (((bEvil >> 4) & 3) * 16),
11435 uSrc >> (((bEvil >> 6) & 3) * 16));
11436}
11437
11438#endif
11439
11440IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11441{
11442 puDst->QWords.qw0 = puSrc->QWords.qw0;
11443 uint64_t const uSrc1 = puSrc->QWords.qw1;
11444 puDst->QWords.qw2 = puSrc->QWords.qw2;
11445 uint64_t const uSrc3 = puSrc->QWords.qw3;
11446 ASMCompilerBarrier();
11447 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
11448 uSrc1 >> (((bEvil >> 2) & 3) * 16),
11449 uSrc1 >> (((bEvil >> 4) & 3) * 16),
11450 uSrc1 >> (((bEvil >> 6) & 3) * 16));
11451 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
11452 uSrc3 >> (((bEvil >> 2) & 3) * 16),
11453 uSrc3 >> (((bEvil >> 4) & 3) * 16),
11454 uSrc3 >> (((bEvil >> 6) & 3) * 16));
11455}
11456
11457#ifdef IEM_WITHOUT_ASSEMBLY
11458IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11459{
11460 puDst->QWords.qw1 = puSrc->QWords.qw1;
11461 uint64_t const uSrc = puSrc->QWords.qw0;
11462 ASMCompilerBarrier();
11463 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11464 uSrc >> (((bEvil >> 2) & 3) * 16),
11465 uSrc >> (((bEvil >> 4) & 3) * 16),
11466 uSrc >> (((bEvil >> 6) & 3) * 16));
11467
11468}
11469#endif
11470
11471
11472IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11473{
11474 puDst->QWords.qw3 = puSrc->QWords.qw3;
11475 uint64_t const uSrc2 = puSrc->QWords.qw2;
11476 puDst->QWords.qw1 = puSrc->QWords.qw1;
11477 uint64_t const uSrc0 = puSrc->QWords.qw0;
11478 ASMCompilerBarrier();
11479 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
11480 uSrc0 >> (((bEvil >> 2) & 3) * 16),
11481 uSrc0 >> (((bEvil >> 4) & 3) * 16),
11482 uSrc0 >> (((bEvil >> 6) & 3) * 16));
11483 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
11484 uSrc2 >> (((bEvil >> 2) & 3) * 16),
11485 uSrc2 >> (((bEvil >> 4) & 3) * 16),
11486 uSrc2 >> (((bEvil >> 6) & 3) * 16));
11487
11488}
11489
11490
11491#ifdef IEM_WITHOUT_ASSEMBLY
11492IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11493{
11494 RTUINT128U const uSrc = *puSrc;
11495 ASMCompilerBarrier();
11496 puDst->au32[0] = uSrc.au32[bEvil & 3];
11497 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
11498 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
11499 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
11500}
11501#endif
11502
11503
11504IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11505{
11506 RTUINT256U const uSrc = *puSrc;
11507 ASMCompilerBarrier();
11508 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
11509 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
11510 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
11511 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
11512 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
11513 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
11514 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
11515 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
11516}
11517
11518
11519/*
11520 * PUNPCKHBW - high bytes -> words
11521 */
11522#ifdef IEM_WITHOUT_ASSEMBLY
11523
11524IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11525{
11526 RTUINT64U const uSrc2 = { *puSrc };
11527 RTUINT64U const uSrc1 = { *puDst };
11528 ASMCompilerBarrier();
11529 RTUINT64U uDstOut;
11530 uDstOut.au8[0] = uSrc1.au8[4];
11531 uDstOut.au8[1] = uSrc2.au8[4];
11532 uDstOut.au8[2] = uSrc1.au8[5];
11533 uDstOut.au8[3] = uSrc2.au8[5];
11534 uDstOut.au8[4] = uSrc1.au8[6];
11535 uDstOut.au8[5] = uSrc2.au8[6];
11536 uDstOut.au8[6] = uSrc1.au8[7];
11537 uDstOut.au8[7] = uSrc2.au8[7];
11538 *puDst = uDstOut.u;
11539}
11540
11541
11542IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11543{
11544 RTUINT128U const uSrc2 = *puSrc;
11545 RTUINT128U const uSrc1 = *puDst;
11546 ASMCompilerBarrier();
11547 RTUINT128U uDstOut;
11548 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11549 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11550 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11551 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11552 uDstOut.au8[ 4] = uSrc1.au8[10];
11553 uDstOut.au8[ 5] = uSrc2.au8[10];
11554 uDstOut.au8[ 6] = uSrc1.au8[11];
11555 uDstOut.au8[ 7] = uSrc2.au8[11];
11556 uDstOut.au8[ 8] = uSrc1.au8[12];
11557 uDstOut.au8[ 9] = uSrc2.au8[12];
11558 uDstOut.au8[10] = uSrc1.au8[13];
11559 uDstOut.au8[11] = uSrc2.au8[13];
11560 uDstOut.au8[12] = uSrc1.au8[14];
11561 uDstOut.au8[13] = uSrc2.au8[14];
11562 uDstOut.au8[14] = uSrc1.au8[15];
11563 uDstOut.au8[15] = uSrc2.au8[15];
11564 *puDst = uDstOut;
11565}
11566
11567#endif
11568
11569IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11570{
11571 RTUINT128U const uSrc2 = *puSrc2;
11572 RTUINT128U const uSrc1 = *puSrc1;
11573 ASMCompilerBarrier();
11574 RTUINT128U uDstOut;
11575 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11576 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11577 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11578 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11579 uDstOut.au8[ 4] = uSrc1.au8[10];
11580 uDstOut.au8[ 5] = uSrc2.au8[10];
11581 uDstOut.au8[ 6] = uSrc1.au8[11];
11582 uDstOut.au8[ 7] = uSrc2.au8[11];
11583 uDstOut.au8[ 8] = uSrc1.au8[12];
11584 uDstOut.au8[ 9] = uSrc2.au8[12];
11585 uDstOut.au8[10] = uSrc1.au8[13];
11586 uDstOut.au8[11] = uSrc2.au8[13];
11587 uDstOut.au8[12] = uSrc1.au8[14];
11588 uDstOut.au8[13] = uSrc2.au8[14];
11589 uDstOut.au8[14] = uSrc1.au8[15];
11590 uDstOut.au8[15] = uSrc2.au8[15];
11591 *puDst = uDstOut;
11592}
11593
11594
11595IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11596{
11597 RTUINT256U const uSrc2 = *puSrc2;
11598 RTUINT256U const uSrc1 = *puSrc1;
11599 ASMCompilerBarrier();
11600 RTUINT256U uDstOut;
11601 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11602 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11603 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11604 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11605 uDstOut.au8[ 4] = uSrc1.au8[10];
11606 uDstOut.au8[ 5] = uSrc2.au8[10];
11607 uDstOut.au8[ 6] = uSrc1.au8[11];
11608 uDstOut.au8[ 7] = uSrc2.au8[11];
11609 uDstOut.au8[ 8] = uSrc1.au8[12];
11610 uDstOut.au8[ 9] = uSrc2.au8[12];
11611 uDstOut.au8[10] = uSrc1.au8[13];
11612 uDstOut.au8[11] = uSrc2.au8[13];
11613 uDstOut.au8[12] = uSrc1.au8[14];
11614 uDstOut.au8[13] = uSrc2.au8[14];
11615 uDstOut.au8[14] = uSrc1.au8[15];
11616 uDstOut.au8[15] = uSrc2.au8[15];
11617 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11618 uDstOut.au8[16] = uSrc1.au8[24];
11619 uDstOut.au8[17] = uSrc2.au8[24];
11620 uDstOut.au8[18] = uSrc1.au8[25];
11621 uDstOut.au8[19] = uSrc2.au8[25];
11622 uDstOut.au8[20] = uSrc1.au8[26];
11623 uDstOut.au8[21] = uSrc2.au8[26];
11624 uDstOut.au8[22] = uSrc1.au8[27];
11625 uDstOut.au8[23] = uSrc2.au8[27];
11626 uDstOut.au8[24] = uSrc1.au8[28];
11627 uDstOut.au8[25] = uSrc2.au8[28];
11628 uDstOut.au8[26] = uSrc1.au8[29];
11629 uDstOut.au8[27] = uSrc2.au8[29];
11630 uDstOut.au8[28] = uSrc1.au8[30];
11631 uDstOut.au8[29] = uSrc2.au8[30];
11632 uDstOut.au8[30] = uSrc1.au8[31];
11633 uDstOut.au8[31] = uSrc2.au8[31];
11634 *puDst = uDstOut;
11635}
11636
11637
/*
 * PUNPCKHWD - high words -> dwords
 */
11641#ifdef IEM_WITHOUT_ASSEMBLY
11642
11643IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11644{
11645 RTUINT64U const uSrc2 = { *puSrc };
11646 RTUINT64U const uSrc1 = { *puDst };
11647 ASMCompilerBarrier();
11648 RTUINT64U uDstOut;
11649 uDstOut.au16[0] = uSrc1.au16[2];
11650 uDstOut.au16[1] = uSrc2.au16[2];
11651 uDstOut.au16[2] = uSrc1.au16[3];
11652 uDstOut.au16[3] = uSrc2.au16[3];
11653 *puDst = uDstOut.u;
11654}
11655
11656
11657IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11658{
11659 RTUINT128U const uSrc2 = *puSrc;
11660 RTUINT128U const uSrc1 = *puDst;
11661 ASMCompilerBarrier();
11662 RTUINT128U uDstOut;
11663 uDstOut.au16[0] = uSrc1.au16[4];
11664 uDstOut.au16[1] = uSrc2.au16[4];
11665 uDstOut.au16[2] = uSrc1.au16[5];
11666 uDstOut.au16[3] = uSrc2.au16[5];
11667 uDstOut.au16[4] = uSrc1.au16[6];
11668 uDstOut.au16[5] = uSrc2.au16[6];
11669 uDstOut.au16[6] = uSrc1.au16[7];
11670 uDstOut.au16[7] = uSrc2.au16[7];
11671 *puDst = uDstOut;
11672}
11673
11674#endif
11675
11676IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11677{
11678 RTUINT128U const uSrc2 = *puSrc2;
11679 RTUINT128U const uSrc1 = *puSrc1;
11680 ASMCompilerBarrier();
11681 RTUINT128U uDstOut;
11682 uDstOut.au16[0] = uSrc1.au16[4];
11683 uDstOut.au16[1] = uSrc2.au16[4];
11684 uDstOut.au16[2] = uSrc1.au16[5];
11685 uDstOut.au16[3] = uSrc2.au16[5];
11686 uDstOut.au16[4] = uSrc1.au16[6];
11687 uDstOut.au16[5] = uSrc2.au16[6];
11688 uDstOut.au16[6] = uSrc1.au16[7];
11689 uDstOut.au16[7] = uSrc2.au16[7];
11690 *puDst = uDstOut;
11691}
11692
11693
11694IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11695{
11696 RTUINT256U const uSrc2 = *puSrc2;
11697 RTUINT256U const uSrc1 = *puSrc1;
11698 ASMCompilerBarrier();
11699 RTUINT256U uDstOut;
11700 uDstOut.au16[0] = uSrc1.au16[4];
11701 uDstOut.au16[1] = uSrc2.au16[4];
11702 uDstOut.au16[2] = uSrc1.au16[5];
11703 uDstOut.au16[3] = uSrc2.au16[5];
11704 uDstOut.au16[4] = uSrc1.au16[6];
11705 uDstOut.au16[5] = uSrc2.au16[6];
11706 uDstOut.au16[6] = uSrc1.au16[7];
11707 uDstOut.au16[7] = uSrc2.au16[7];
11708
11709 uDstOut.au16[8] = uSrc1.au16[12];
11710 uDstOut.au16[9] = uSrc2.au16[12];
11711 uDstOut.au16[10] = uSrc1.au16[13];
11712 uDstOut.au16[11] = uSrc2.au16[13];
11713 uDstOut.au16[12] = uSrc1.au16[14];
11714 uDstOut.au16[13] = uSrc2.au16[14];
11715 uDstOut.au16[14] = uSrc1.au16[15];
11716 uDstOut.au16[15] = uSrc2.au16[15];
11717 *puDst = uDstOut;
11718}
11719
11720
/*
 * PUNPCKHDQ - high dwords -> qword(s)
 */
11724#ifdef IEM_WITHOUT_ASSEMBLY
11725
11726IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11727{
11728 RTUINT64U const uSrc2 = { *puSrc };
11729 RTUINT64U const uSrc1 = { *puDst };
11730 ASMCompilerBarrier();
11731 RTUINT64U uDstOut;
11732 uDstOut.au32[0] = uSrc1.au32[1];
11733 uDstOut.au32[1] = uSrc2.au32[1];
11734 *puDst = uDstOut.u;
11735}
11736
11737
11738IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11739{
11740 RTUINT128U const uSrc2 = *puSrc;
11741 RTUINT128U const uSrc1 = *puDst;
11742 ASMCompilerBarrier();
11743 RTUINT128U uDstOut;
11744 uDstOut.au32[0] = uSrc1.au32[2];
11745 uDstOut.au32[1] = uSrc2.au32[2];
11746 uDstOut.au32[2] = uSrc1.au32[3];
11747 uDstOut.au32[3] = uSrc2.au32[3];
11748 *puDst = uDstOut;
11749}
11750
11751#endif
11752
11753IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11754{
11755 RTUINT128U const uSrc2 = *puSrc2;
11756 RTUINT128U const uSrc1 = *puSrc1;
11757 ASMCompilerBarrier();
11758 RTUINT128U uDstOut;
11759 uDstOut.au32[0] = uSrc1.au32[2];
11760 uDstOut.au32[1] = uSrc2.au32[2];
11761 uDstOut.au32[2] = uSrc1.au32[3];
11762 uDstOut.au32[3] = uSrc2.au32[3];
11763 *puDst = uDstOut;
11764}
11765
11766
11767IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11768{
11769 RTUINT256U const uSrc2 = *puSrc2;
11770 RTUINT256U const uSrc1 = *puSrc1;
11771 ASMCompilerBarrier();
11772 RTUINT256U uDstOut;
11773 uDstOut.au32[0] = uSrc1.au32[2];
11774 uDstOut.au32[1] = uSrc2.au32[2];
11775 uDstOut.au32[2] = uSrc1.au32[3];
11776 uDstOut.au32[3] = uSrc2.au32[3];
11777
11778 uDstOut.au32[4] = uSrc1.au32[6];
11779 uDstOut.au32[5] = uSrc2.au32[6];
11780 uDstOut.au32[6] = uSrc1.au32[7];
11781 uDstOut.au32[7] = uSrc2.au32[7];
11782 *puDst = uDstOut;
11783}
11784
11785
11786/*
11787 * PUNPCKHQDQ -> High qwords -> double qword(s).
11788 */
11789#ifdef IEM_WITHOUT_ASSEMBLY
11790IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11791{
11792 RTUINT128U const uSrc2 = *puSrc;
11793 RTUINT128U const uSrc1 = *puDst;
11794 ASMCompilerBarrier();
11795 RTUINT128U uDstOut;
11796 uDstOut.au64[0] = uSrc1.au64[1];
11797 uDstOut.au64[1] = uSrc2.au64[1];
11798 *puDst = uDstOut;
11799}
11800#endif
11801
11802
11803IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11804{
11805 RTUINT128U const uSrc2 = *puSrc2;
11806 RTUINT128U const uSrc1 = *puSrc1;
11807 ASMCompilerBarrier();
11808 RTUINT128U uDstOut;
11809 uDstOut.au64[0] = uSrc1.au64[1];
11810 uDstOut.au64[1] = uSrc2.au64[1];
11811 *puDst = uDstOut;
11812}
11813
11814
11815IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11816{
11817 RTUINT256U const uSrc2 = *puSrc2;
11818 RTUINT256U const uSrc1 = *puSrc1;
11819 ASMCompilerBarrier();
11820 RTUINT256U uDstOut;
11821 uDstOut.au64[0] = uSrc1.au64[1];
11822 uDstOut.au64[1] = uSrc2.au64[1];
11823
11824 uDstOut.au64[2] = uSrc1.au64[3];
11825 uDstOut.au64[3] = uSrc2.au64[3];
11826 *puDst = uDstOut;
11827}
11828
11829
11830/*
11831 * PUNPCKLBW - low bytes -> words
11832 */
11833#ifdef IEM_WITHOUT_ASSEMBLY
11834
11835IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11836{
11837 RTUINT64U const uSrc2 = { *puSrc };
11838 RTUINT64U const uSrc1 = { *puDst };
11839 ASMCompilerBarrier();
11840 RTUINT64U uDstOut;
11841 uDstOut.au8[0] = uSrc1.au8[0];
11842 uDstOut.au8[1] = uSrc2.au8[0];
11843 uDstOut.au8[2] = uSrc1.au8[1];
11844 uDstOut.au8[3] = uSrc2.au8[1];
11845 uDstOut.au8[4] = uSrc1.au8[2];
11846 uDstOut.au8[5] = uSrc2.au8[2];
11847 uDstOut.au8[6] = uSrc1.au8[3];
11848 uDstOut.au8[7] = uSrc2.au8[3];
11849 *puDst = uDstOut.u;
11850}
11851
11852
11853IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11854{
11855 RTUINT128U const uSrc2 = *puSrc;
11856 RTUINT128U const uSrc1 = *puDst;
11857 ASMCompilerBarrier();
11858 RTUINT128U uDstOut;
11859 uDstOut.au8[ 0] = uSrc1.au8[0];
11860 uDstOut.au8[ 1] = uSrc2.au8[0];
11861 uDstOut.au8[ 2] = uSrc1.au8[1];
11862 uDstOut.au8[ 3] = uSrc2.au8[1];
11863 uDstOut.au8[ 4] = uSrc1.au8[2];
11864 uDstOut.au8[ 5] = uSrc2.au8[2];
11865 uDstOut.au8[ 6] = uSrc1.au8[3];
11866 uDstOut.au8[ 7] = uSrc2.au8[3];
11867 uDstOut.au8[ 8] = uSrc1.au8[4];
11868 uDstOut.au8[ 9] = uSrc2.au8[4];
11869 uDstOut.au8[10] = uSrc1.au8[5];
11870 uDstOut.au8[11] = uSrc2.au8[5];
11871 uDstOut.au8[12] = uSrc1.au8[6];
11872 uDstOut.au8[13] = uSrc2.au8[6];
11873 uDstOut.au8[14] = uSrc1.au8[7];
11874 uDstOut.au8[15] = uSrc2.au8[7];
11875 *puDst = uDstOut;
11876}
11877
11878#endif
11879
11880IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11881{
11882 RTUINT128U const uSrc2 = *puSrc2;
11883 RTUINT128U const uSrc1 = *puSrc1;
11884 ASMCompilerBarrier();
11885 RTUINT128U uDstOut;
11886 uDstOut.au8[ 0] = uSrc1.au8[0];
11887 uDstOut.au8[ 1] = uSrc2.au8[0];
11888 uDstOut.au8[ 2] = uSrc1.au8[1];
11889 uDstOut.au8[ 3] = uSrc2.au8[1];
11890 uDstOut.au8[ 4] = uSrc1.au8[2];
11891 uDstOut.au8[ 5] = uSrc2.au8[2];
11892 uDstOut.au8[ 6] = uSrc1.au8[3];
11893 uDstOut.au8[ 7] = uSrc2.au8[3];
11894 uDstOut.au8[ 8] = uSrc1.au8[4];
11895 uDstOut.au8[ 9] = uSrc2.au8[4];
11896 uDstOut.au8[10] = uSrc1.au8[5];
11897 uDstOut.au8[11] = uSrc2.au8[5];
11898 uDstOut.au8[12] = uSrc1.au8[6];
11899 uDstOut.au8[13] = uSrc2.au8[6];
11900 uDstOut.au8[14] = uSrc1.au8[7];
11901 uDstOut.au8[15] = uSrc2.au8[7];
11902 *puDst = uDstOut;
11903}
11904
11905
11906IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11907{
11908 RTUINT256U const uSrc2 = *puSrc2;
11909 RTUINT256U const uSrc1 = *puSrc1;
11910 ASMCompilerBarrier();
11911 RTUINT256U uDstOut;
11912 uDstOut.au8[ 0] = uSrc1.au8[0];
11913 uDstOut.au8[ 1] = uSrc2.au8[0];
11914 uDstOut.au8[ 2] = uSrc1.au8[1];
11915 uDstOut.au8[ 3] = uSrc2.au8[1];
11916 uDstOut.au8[ 4] = uSrc1.au8[2];
11917 uDstOut.au8[ 5] = uSrc2.au8[2];
11918 uDstOut.au8[ 6] = uSrc1.au8[3];
11919 uDstOut.au8[ 7] = uSrc2.au8[3];
11920 uDstOut.au8[ 8] = uSrc1.au8[4];
11921 uDstOut.au8[ 9] = uSrc2.au8[4];
11922 uDstOut.au8[10] = uSrc1.au8[5];
11923 uDstOut.au8[11] = uSrc2.au8[5];
11924 uDstOut.au8[12] = uSrc1.au8[6];
11925 uDstOut.au8[13] = uSrc2.au8[6];
11926 uDstOut.au8[14] = uSrc1.au8[7];
11927 uDstOut.au8[15] = uSrc2.au8[7];
11928 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11929 uDstOut.au8[16] = uSrc1.au8[16];
11930 uDstOut.au8[17] = uSrc2.au8[16];
11931 uDstOut.au8[18] = uSrc1.au8[17];
11932 uDstOut.au8[19] = uSrc2.au8[17];
11933 uDstOut.au8[20] = uSrc1.au8[18];
11934 uDstOut.au8[21] = uSrc2.au8[18];
11935 uDstOut.au8[22] = uSrc1.au8[19];
11936 uDstOut.au8[23] = uSrc2.au8[19];
11937 uDstOut.au8[24] = uSrc1.au8[20];
11938 uDstOut.au8[25] = uSrc2.au8[20];
11939 uDstOut.au8[26] = uSrc1.au8[21];
11940 uDstOut.au8[27] = uSrc2.au8[21];
11941 uDstOut.au8[28] = uSrc1.au8[22];
11942 uDstOut.au8[29] = uSrc2.au8[22];
11943 uDstOut.au8[30] = uSrc1.au8[23];
11944 uDstOut.au8[31] = uSrc2.au8[23];
11945 *puDst = uDstOut;
11946}
11947
11948
11949/*
11950 * PUNPCKLWD - low words -> dwords
11951 */
11952#ifdef IEM_WITHOUT_ASSEMBLY
11953
11954IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11955{
11956 RTUINT64U const uSrc2 = { *puSrc };
11957 RTUINT64U const uSrc1 = { *puDst };
11958 ASMCompilerBarrier();
11959 RTUINT64U uDstOut;
11960 uDstOut.au16[0] = uSrc1.au16[0];
11961 uDstOut.au16[1] = uSrc2.au16[0];
11962 uDstOut.au16[2] = uSrc1.au16[1];
11963 uDstOut.au16[3] = uSrc2.au16[1];
11964 *puDst = uDstOut.u;
11965}
11966
11967
11968IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11969{
11970 RTUINT128U const uSrc2 = *puSrc;
11971 RTUINT128U const uSrc1 = *puDst;
11972 ASMCompilerBarrier();
11973 RTUINT128U uDstOut;
11974 uDstOut.au16[0] = uSrc1.au16[0];
11975 uDstOut.au16[1] = uSrc2.au16[0];
11976 uDstOut.au16[2] = uSrc1.au16[1];
11977 uDstOut.au16[3] = uSrc2.au16[1];
11978 uDstOut.au16[4] = uSrc1.au16[2];
11979 uDstOut.au16[5] = uSrc2.au16[2];
11980 uDstOut.au16[6] = uSrc1.au16[3];
11981 uDstOut.au16[7] = uSrc2.au16[3];
11982 *puDst = uDstOut;
11983}
11984
11985#endif
11986
11987IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11988{
11989 RTUINT128U const uSrc2 = *puSrc2;
11990 RTUINT128U const uSrc1 = *puSrc1;
11991 ASMCompilerBarrier();
11992 RTUINT128U uDstOut;
11993 uDstOut.au16[0] = uSrc1.au16[0];
11994 uDstOut.au16[1] = uSrc2.au16[0];
11995 uDstOut.au16[2] = uSrc1.au16[1];
11996 uDstOut.au16[3] = uSrc2.au16[1];
11997 uDstOut.au16[4] = uSrc1.au16[2];
11998 uDstOut.au16[5] = uSrc2.au16[2];
11999 uDstOut.au16[6] = uSrc1.au16[3];
12000 uDstOut.au16[7] = uSrc2.au16[3];
12001 *puDst = uDstOut;
12002}
12003
12004
12005IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12006{
12007 RTUINT256U const uSrc2 = *puSrc2;
12008 RTUINT256U const uSrc1 = *puSrc1;
12009 ASMCompilerBarrier();
12010 RTUINT256U uDstOut;
12011 uDstOut.au16[0] = uSrc1.au16[0];
12012 uDstOut.au16[1] = uSrc2.au16[0];
12013 uDstOut.au16[2] = uSrc1.au16[1];
12014 uDstOut.au16[3] = uSrc2.au16[1];
12015 uDstOut.au16[4] = uSrc1.au16[2];
12016 uDstOut.au16[5] = uSrc2.au16[2];
12017 uDstOut.au16[6] = uSrc1.au16[3];
12018 uDstOut.au16[7] = uSrc2.au16[3];
12019
12020 uDstOut.au16[8] = uSrc1.au16[8];
12021 uDstOut.au16[9] = uSrc2.au16[8];
12022 uDstOut.au16[10] = uSrc1.au16[9];
12023 uDstOut.au16[11] = uSrc2.au16[9];
12024 uDstOut.au16[12] = uSrc1.au16[10];
12025 uDstOut.au16[13] = uSrc2.au16[10];
12026 uDstOut.au16[14] = uSrc1.au16[11];
12027 uDstOut.au16[15] = uSrc2.au16[11];
12028 *puDst = uDstOut;
12029}
12030
12031
12032/*
12033 * PUNPCKLDQ - low dwords -> qword(s)
12034 */
12035#ifdef IEM_WITHOUT_ASSEMBLY
12036
12037IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12038{
12039 RTUINT64U const uSrc2 = { *puSrc };
12040 RTUINT64U const uSrc1 = { *puDst };
12041 ASMCompilerBarrier();
12042 RTUINT64U uDstOut;
12043 uDstOut.au32[0] = uSrc1.au32[0];
12044 uDstOut.au32[1] = uSrc2.au32[0];
12045 *puDst = uDstOut.u;
12046}
12047
12048
12049IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12050{
12051 RTUINT128U const uSrc2 = *puSrc;
12052 RTUINT128U const uSrc1 = *puDst;
12053 ASMCompilerBarrier();
12054 RTUINT128U uDstOut;
12055 uDstOut.au32[0] = uSrc1.au32[0];
12056 uDstOut.au32[1] = uSrc2.au32[0];
12057 uDstOut.au32[2] = uSrc1.au32[1];
12058 uDstOut.au32[3] = uSrc2.au32[1];
12059 *puDst = uDstOut;
12060}
12061
12062#endif
12063
12064IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12065{
12066 RTUINT128U const uSrc2 = *puSrc2;
12067 RTUINT128U const uSrc1 = *puSrc1;
12068 ASMCompilerBarrier();
12069 RTUINT128U uDstOut;
12070 uDstOut.au32[0] = uSrc1.au32[0];
12071 uDstOut.au32[1] = uSrc2.au32[0];
12072 uDstOut.au32[2] = uSrc1.au32[1];
12073 uDstOut.au32[3] = uSrc2.au32[1];
12074 *puDst = uDstOut;
12075}
12076
12077
12078IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12079{
12080 RTUINT256U const uSrc2 = *puSrc2;
12081 RTUINT256U const uSrc1 = *puSrc1;
12082 ASMCompilerBarrier();
12083 RTUINT256U uDstOut;
12084 uDstOut.au32[0] = uSrc1.au32[0];
12085 uDstOut.au32[1] = uSrc2.au32[0];
12086 uDstOut.au32[2] = uSrc1.au32[1];
12087 uDstOut.au32[3] = uSrc2.au32[1];
12088
12089 uDstOut.au32[4] = uSrc1.au32[4];
12090 uDstOut.au32[5] = uSrc2.au32[4];
12091 uDstOut.au32[6] = uSrc1.au32[5];
12092 uDstOut.au32[7] = uSrc2.au32[5];
12093 *puDst = uDstOut;
12094}
12095
12096
12097/*
12098 * PUNPCKLQDQ -> Low qwords -> double qword(s).
12099 */
12100#ifdef IEM_WITHOUT_ASSEMBLY
12101IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12102{
12103 RTUINT128U const uSrc2 = *puSrc;
12104 RTUINT128U const uSrc1 = *puDst;
12105 ASMCompilerBarrier();
12106 RTUINT128U uDstOut;
12107 uDstOut.au64[0] = uSrc1.au64[0];
12108 uDstOut.au64[1] = uSrc2.au64[0];
12109 *puDst = uDstOut;
12110}
12111#endif
12112
12113
12114IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12115{
12116 RTUINT128U const uSrc2 = *puSrc2;
12117 RTUINT128U const uSrc1 = *puSrc1;
12118 ASMCompilerBarrier();
12119 RTUINT128U uDstOut;
12120 uDstOut.au64[0] = uSrc1.au64[0];
12121 uDstOut.au64[1] = uSrc2.au64[0];
12122 *puDst = uDstOut;
12123}
12124
12125
12126IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12127{
12128 RTUINT256U const uSrc2 = *puSrc2;
12129 RTUINT256U const uSrc1 = *puSrc1;
12130 ASMCompilerBarrier();
12131 RTUINT256U uDstOut;
12132 uDstOut.au64[0] = uSrc1.au64[0];
12133 uDstOut.au64[1] = uSrc2.au64[0];
12134
12135 uDstOut.au64[2] = uSrc1.au64[2];
12136 uDstOut.au64[3] = uSrc2.au64[2];
12137 *puDst = uDstOut;
12138}
12139
12140
12141/*
12142 * PACKSSWB - signed words -> signed bytes
12143 */
12144
12145#ifdef IEM_WITHOUT_ASSEMBLY
12146
12147IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12148{
12149 RTUINT64U const uSrc2 = { *puSrc };
12150 RTUINT64U const uSrc1 = { *puDst };
12151 ASMCompilerBarrier();
12152 RTUINT64U uDstOut;
12153 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12154 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12155 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12156 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12157 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12158 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12159 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12160 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12161 *puDst = uDstOut.u;
12162}
12163
12164
12165IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12166{
12167 RTUINT128U const uSrc2 = *puSrc;
12168 RTUINT128U const uSrc1 = *puDst;
12169 ASMCompilerBarrier();
12170 RTUINT128U uDstOut;
12171 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12172 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12173 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12174 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12175 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12176 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12177 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12178 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12179 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12180 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12181 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12182 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12183 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12184 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12185 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12186 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12187 *puDst = uDstOut;
12188}
12189
12190#endif
12191
12192IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12193{
12194 RTUINT128U const uSrc2 = *puSrc2;
12195 RTUINT128U const uSrc1 = *puSrc1;
12196 ASMCompilerBarrier();
12197 RTUINT128U uDstOut;
12198 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12199 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12200 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12201 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12202 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12203 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12204 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12205 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12206 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12207 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12208 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12209 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12210 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12211 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12212 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12213 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12214 *puDst = uDstOut;
12215}
12216
12217
12218IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12219{
12220 RTUINT256U const uSrc2 = *puSrc2;
12221 RTUINT256U const uSrc1 = *puSrc1;
12222 ASMCompilerBarrier();
12223 RTUINT256U uDstOut;
12224 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12225 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12226 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12227 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12228 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12229 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12230 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12231 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12232 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12233 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12234 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12235 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12236 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12237 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12238 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12239 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12240
12241 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
12242 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
12243 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
12244 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
12245 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
12246 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
12247 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
12248 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
12249 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
12250 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
12251 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
12252 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
12253 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
12254 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
12255 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
12256 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
12257 *puDst = uDstOut;
12258}
12259
12260
12261/*
12262 * PACKUSWB - signed words -> unsigned bytes
12263 */
/** Converts a signed 16-bit word (passed as its raw bit pattern) to an unsigned
 *  byte with saturation: 0..0xff passes through unchanged, anything above 0xff
 *  with bit 15 (the sign) clear becomes 0xff, and anything with bit 15 set
 *  becomes 0x00. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) /* sign set -> 0x00 (UINT8_MIN), else 0xff (UINT8_MAX) */ \
      : (uint8_t)(a_iWord) )
12268
12269#ifdef IEM_WITHOUT_ASSEMBLY
12270
12271IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12272{
12273 RTUINT64U const uSrc2 = { *puSrc };
12274 RTUINT64U const uSrc1 = { *puDst };
12275 ASMCompilerBarrier();
12276 RTUINT64U uDstOut;
12277 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12278 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12279 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12280 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12281 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12282 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12283 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12284 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12285 *puDst = uDstOut.u;
12286}
12287
12288
12289IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12290{
12291 RTUINT128U const uSrc2 = *puSrc;
12292 RTUINT128U const uSrc1 = *puDst;
12293 ASMCompilerBarrier();
12294 RTUINT128U uDstOut;
12295 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12296 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12297 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12298 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12299 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12300 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12301 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12302 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12303 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12304 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12305 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12306 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12307 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12308 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12309 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12310 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12311 *puDst = uDstOut;
12312}
12313
12314#endif
12315
12316IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12317{
12318 RTUINT128U const uSrc2 = *puSrc2;
12319 RTUINT128U const uSrc1 = *puSrc1;
12320 ASMCompilerBarrier();
12321 RTUINT128U uDstOut;
12322 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12323 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12324 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12325 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12326 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12327 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12328 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12329 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12330 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12331 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12332 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12333 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12334 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12335 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12336 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12337 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12338 *puDst = uDstOut;
12339}
12340
12341
12342IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12343{
12344 RTUINT256U const uSrc2 = *puSrc2;
12345 RTUINT256U const uSrc1 = *puSrc1;
12346 ASMCompilerBarrier();
12347 RTUINT256U uDstOut;
12348 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12349 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12350 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12351 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12352 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12353 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12354 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12355 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12356 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12357 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12358 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12359 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12360 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12361 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12362 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12363 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12364
12365 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
12366 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
12367 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
12368 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
12369 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
12370 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
12371 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
12372 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
12373 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
12374 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
12375 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
12376 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
12377 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
12378 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
12379 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
12380 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
12381 *puDst = uDstOut;
12382}
12383
12384
12385/*
12386 * PACKSSDW - signed dwords -> signed words
12387 */
12388
12389#ifdef IEM_WITHOUT_ASSEMBLY
12390
12391IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12392{
12393 RTUINT64U const uSrc2 = { *puSrc };
12394 RTUINT64U const uSrc1 = { *puDst };
12395 ASMCompilerBarrier();
12396 RTUINT64U uDstOut;
12397 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12398 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12399 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12400 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12401 *puDst = uDstOut.u;
12402}
12403
12404
12405IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12406{
12407 RTUINT128U const uSrc2 = *puSrc;
12408 RTUINT128U const uSrc1 = *puDst;
12409 ASMCompilerBarrier();
12410 RTUINT128U uDstOut;
12411 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12412 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12413 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12414 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12415 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12416 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12417 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12418 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12419 *puDst = uDstOut;
12420}
12421
12422#endif
12423
12424IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12425{
12426 RTUINT128U const uSrc2 = *puSrc2;
12427 RTUINT128U const uSrc1 = *puSrc1;
12428 ASMCompilerBarrier();
12429 RTUINT128U uDstOut;
12430 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12431 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12432 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12433 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12434 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12435 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12436 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12437 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12438 *puDst = uDstOut;
12439}
12440
12441
12442IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12443{
12444 RTUINT256U const uSrc2 = *puSrc2;
12445 RTUINT256U const uSrc1 = *puSrc1;
12446 ASMCompilerBarrier();
12447 RTUINT256U uDstOut;
12448 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12449 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12450 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12451 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12452 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12453 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12454 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12455 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12456
12457 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
12458 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
12459 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
12460 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
12461 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
12462 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
12463 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
12464 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
12465 *puDst = uDstOut;
12466}
12467
12468
12469/*
12470 * PACKUSDW - signed dwords -> unsigned words
12471 */
/** Converts a signed 32-bit dword (passed as its raw bit pattern) to an unsigned
 *  word with saturation: 0..0xffff passes through unchanged, anything above
 *  0xffff with bit 31 (the sign) clear becomes 0xffff, and anything with bit 31
 *  set becomes 0. */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    ( (uint32_t)(a_iDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff * (uint16_t)((((a_iDword) >> 31) & 1) ^ 1) /* sign set -> 0, else 0xffff (UINT16_MAX) */ \
      : (uint16_t)(a_iDword) )
12476
12477#ifdef IEM_WITHOUT_ASSEMBLY
12478IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12479{
12480 RTUINT128U const uSrc2 = *puSrc;
12481 RTUINT128U const uSrc1 = *puDst;
12482 ASMCompilerBarrier();
12483 RTUINT128U uDstOut;
12484 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12485 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12486 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12487 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12488 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12489 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12490 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12491 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12492 *puDst = uDstOut;
12493}
12494#endif
12495
12496IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12497{
12498 RTUINT128U const uSrc2 = *puSrc2;
12499 RTUINT128U const uSrc1 = *puSrc1;
12500 ASMCompilerBarrier();
12501 RTUINT128U uDstOut;
12502 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12503 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12504 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12505 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12506 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12507 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12508 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12509 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12510 *puDst = uDstOut;
12511}
12512
12513
12514IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12515{
12516 RTUINT256U const uSrc2 = *puSrc2;
12517 RTUINT256U const uSrc1 = *puSrc1;
12518 ASMCompilerBarrier();
12519 RTUINT256U uDstOut;
12520 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12521 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12522 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12523 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12524 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12525 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12526 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12527 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12528
12529 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
12530 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
12531 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
12532 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
12533 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
12534 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
12535 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
12536 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
12537 *puDst = uDstOut;
12538}
12539
12540
12541/*
12542 * [V]PABSB / [V]PABSW / [V]PABSD
12543 */
12544
12545IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12546{
12547 RTUINT64U const uSrc = { *puSrc };
12548 RTUINT64U uDstOut = { 0 };
12549
12550 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12551 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12552 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12553 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12554 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12555 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12556 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12557 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12558 *puDst = uDstOut.u;
12559 RT_NOREF(pFpuState);
12560}
12561
12562
12563IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12564{
12565 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12566 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12567 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12568 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12569 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12570 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12571 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12572 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12573 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12574 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12575 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12576 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12577 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12578 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12579 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12580 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12581 RT_NOREF(pFpuState);
12582}
12583
12584
12585IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12586{
12587 RTUINT64U const uSrc = { *puSrc };
12588 RTUINT64U uDstOut = { 0 };
12589
12590 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
12591 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
12592 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
12593 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
12594 *puDst = uDstOut.u;
12595 RT_NOREF(pFpuState);
12596}
12597
12598
12599IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12600{
12601 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12602 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12603 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12604 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12605 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12606 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12607 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12608 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12609 RT_NOREF(pFpuState);
12610}
12611
12612
12613IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12614{
12615 RTUINT64U const uSrc = { *puSrc };
12616 RTUINT64U uDstOut = { 0 };
12617
12618 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
12619 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
12620 *puDst = uDstOut.u;
12621 RT_NOREF(pFpuState);
12622}
12623
12624
12625IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12626{
12627 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12628 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12629 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12630 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12631 RT_NOREF(pFpuState);
12632}
12633
12634
12635IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12636{
12637 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12638 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12639 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12640 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12641 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12642 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12643 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12644 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12645 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12646 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12647 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12648 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12649 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12650 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12651 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12652 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12653}
12654
12655
12656IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12657{
12658 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12659 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12660 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12661 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12662 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12663 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12664 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12665 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12666 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12667 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12668 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12669 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12670 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12671 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12672 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12673 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12674 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
12675 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
12676 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
12677 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
12678 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
12679 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
12680 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
12681 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
12682 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
12683 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
12684 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
12685 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
12686 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
12687 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
12688 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
12689 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
12690}
12691
12692
12693IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12694{
12695 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12696 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12697 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12698 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12699 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12700 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12701 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12702 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12703}
12704
12705
12706IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12707{
12708 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12709 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12710 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12711 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12712 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12713 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12714 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12715 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12716 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
12717 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
12718 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
12719 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
12720 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
12721 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
12722 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
12723 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
12724}
12725
12726
12727IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12728{
12729 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12730 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12731 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12732 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12733}
12734
12735
12736IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12737{
12738 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12739 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12740 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12741 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12742 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
12743 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
12744 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
12745 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
12746}
12747
12748
12749/*
12750 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
12751 */
12752IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12753{
12754 RTUINT64U uSrc1 = { *puDst };
12755 RTUINT64U uSrc2 = { *puSrc };
12756 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12757
12758 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
12759 {
12760 if (uSrc2.ai8[i] < 0)
12761 uDst.ai8[i] = -uSrc1.ai8[i];
12762 else if (uSrc2.ai8[i] == 0)
12763 uDst.ai8[i] = 0;
12764 else /* uSrc2.ai8[i] > 0 */
12765 uDst.ai8[i] = uSrc1.ai8[i];
12766 }
12767
12768 *puDst = uDst.u;
12769 RT_NOREF(pFpuState);
12770}
12771
12772
12773IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12774{
12775 RTUINT128U uSrc1 = *puDst;
12776
12777 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12778 {
12779 if (puSrc->ai8[i] < 0)
12780 puDst->ai8[i] = -uSrc1.ai8[i];
12781 else if (puSrc->ai8[i] == 0)
12782 puDst->ai8[i] = 0;
12783 else /* puSrc->ai8[i] > 0 */
12784 puDst->ai8[i] = uSrc1.ai8[i];
12785 }
12786
12787 RT_NOREF(pFpuState);
12788}
12789
12790
12791IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12792{
12793 RTUINT64U uSrc1 = { *puDst };
12794 RTUINT64U uSrc2 = { *puSrc };
12795 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12796
12797 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
12798 {
12799 if (uSrc2.ai16[i] < 0)
12800 uDst.ai16[i] = -uSrc1.ai16[i];
12801 else if (uSrc2.ai16[i] == 0)
12802 uDst.ai16[i] = 0;
12803 else /* uSrc2.ai16[i] > 0 */
12804 uDst.ai16[i] = uSrc1.ai16[i];
12805 }
12806
12807 *puDst = uDst.u;
12808 RT_NOREF(pFpuState);
12809}
12810
12811
12812IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12813{
12814 RTUINT128U uSrc1 = *puDst;
12815
12816 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12817 {
12818 if (puSrc->ai16[i] < 0)
12819 puDst->ai16[i] = -uSrc1.ai16[i];
12820 else if (puSrc->ai16[i] == 0)
12821 puDst->ai16[i] = 0;
12822 else /* puSrc->ai16[i] > 0 */
12823 puDst->ai16[i] = uSrc1.ai16[i];
12824 }
12825
12826 RT_NOREF(pFpuState);
12827}
12828
12829
12830IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12831{
12832 RTUINT64U uSrc1 = { *puDst };
12833 RTUINT64U uSrc2 = { *puSrc };
12834 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12835
12836 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
12837 {
12838 if (uSrc2.ai32[i] < 0)
12839 uDst.ai32[i] = -uSrc1.ai32[i];
12840 else if (uSrc2.ai32[i] == 0)
12841 uDst.ai32[i] = 0;
12842 else /* uSrc2.ai32[i] > 0 */
12843 uDst.ai32[i] = uSrc1.ai32[i];
12844 }
12845
12846 *puDst = uDst.u;
12847 RT_NOREF(pFpuState);
12848}
12849
12850
12851IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12852{
12853 RTUINT128U uSrc1 = *puDst;
12854
12855 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12856 {
12857 if (puSrc->ai32[i] < 0)
12858 puDst->ai32[i] = -uSrc1.ai32[i];
12859 else if (puSrc->ai32[i] == 0)
12860 puDst->ai32[i] = 0;
12861 else /* puSrc->ai32[i] > 0 */
12862 puDst->ai32[i] = uSrc1.ai32[i];
12863 }
12864
12865 RT_NOREF(pFpuState);
12866}
12867
12868
12869IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12870{
12871 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12872 {
12873 if (puSrc2->ai8[i] < 0)
12874 puDst->ai8[i] = -puSrc1->ai8[i];
12875 else if (puSrc2->ai8[i] == 0)
12876 puDst->ai8[i] = 0;
12877 else /* puSrc2->ai8[i] > 0 */
12878 puDst->ai8[i] = puSrc1->ai8[i];
12879 }
12880}
12881
12882
12883IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12884{
12885 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12886 {
12887 if (puSrc2->ai8[i] < 0)
12888 puDst->ai8[i] = -puSrc1->ai8[i];
12889 else if (puSrc2->ai8[i] == 0)
12890 puDst->ai8[i] = 0;
12891 else /* puSrc2->ai8[i] > 0 */
12892 puDst->ai8[i] = puSrc1->ai8[i];
12893 }
12894}
12895
12896
12897IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12898{
12899 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12900 {
12901 if (puSrc2->ai16[i] < 0)
12902 puDst->ai16[i] = -puSrc1->ai16[i];
12903 else if (puSrc2->ai16[i] == 0)
12904 puDst->ai16[i] = 0;
12905 else /* puSrc2->ai16[i] > 0 */
12906 puDst->ai16[i] = puSrc1->ai16[i];
12907 }
12908}
12909
12910
12911IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12912{
12913 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12914 {
12915 if (puSrc2->ai16[i] < 0)
12916 puDst->ai16[i] = -puSrc1->ai16[i];
12917 else if (puSrc2->ai16[i] == 0)
12918 puDst->ai16[i] = 0;
12919 else /* puSrc2->ai16[i] > 0 */
12920 puDst->ai16[i] = puSrc1->ai16[i];
12921 }
12922}
12923
12924
12925IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12926{
12927 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12928 {
12929 if (puSrc2->ai32[i] < 0)
12930 puDst->ai32[i] = -puSrc1->ai32[i];
12931 else if (puSrc2->ai32[i] == 0)
12932 puDst->ai32[i] = 0;
12933 else /* puSrc2->ai32[i] > 0 */
12934 puDst->ai32[i] = puSrc1->ai32[i];
12935 }
12936}
12937
12938
12939IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12940{
12941 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12942 {
12943 if (puSrc2->ai32[i] < 0)
12944 puDst->ai32[i] = -puSrc1->ai32[i];
12945 else if (puSrc2->ai32[i] == 0)
12946 puDst->ai32[i] = 0;
12947 else /* puSrc2->ai32[i] > 0 */
12948 puDst->ai32[i] = puSrc1->ai32[i];
12949 }
12950}
12951
12952
12953/*
12954 * PHADDW / VPHADDW / PHADDD / VPHADDD
12955 */
12956IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12957{
12958 RTUINT64U uSrc1 = { *puDst };
12959 RTUINT64U uSrc2 = { *puSrc };
12960 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12961
12962 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
12963 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
12964 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
12965 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
12966 *puDst = uDst.u;
12967 RT_NOREF(pFpuState);
12968}
12969
12970
12971IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12972{
12973 RTUINT128U uSrc1 = *puDst;
12974
12975 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
12976 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
12977 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
12978 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
12979
12980 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
12981 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
12982 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
12983 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
12984 RT_NOREF(pFpuState);
12985}
12986
12987
12988IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12989{
12990 RTUINT64U uSrc1 = { *puDst };
12991 RTUINT64U uSrc2 = { *puSrc };
12992 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12993
12994 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
12995 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
12996 *puDst = uDst.u;
12997 RT_NOREF(pFpuState);
12998}
12999
13000
13001IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13002{
13003 RTUINT128U uSrc1 = *puDst;
13004
13005 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13006 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
13007
13008 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
13009 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
13010 RT_NOREF(pFpuState);
13011}
13012
13013
13014IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13015{
13016 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13017
13018 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
13019 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
13020 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
13021 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
13022
13023 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
13024 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
13025 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
13026 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
13027
13028 puDst->au64[0] = uDst.au64[0];
13029 puDst->au64[1] = uDst.au64[1];
13030}
13031
13032
13033IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13034{
13035 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13036
13037 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
13038 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
13039 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
13040 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
13041 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
13042 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
13043 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
13044 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
13045
13046 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
13047 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
13048 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
13049 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
13050 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
13051 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
13052 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
13053 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
13054
13055 puDst->au64[0] = uDst.au64[0];
13056 puDst->au64[1] = uDst.au64[1];
13057 puDst->au64[2] = uDst.au64[2];
13058 puDst->au64[3] = uDst.au64[3];
13059}
13060
13061
13062IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13063{
13064 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13065
13066 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
13067 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
13068
13069 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
13070 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
13071
13072 puDst->au64[0] = uDst.au64[0];
13073 puDst->au64[1] = uDst.au64[1];
13074}
13075
13076
13077IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13078{
13079 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13080
13081 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
13082 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
13083 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
13084 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
13085
13086 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
13087 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
13088 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
13089 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
13090
13091 puDst->au64[0] = uDst.au64[0];
13092 puDst->au64[1] = uDst.au64[1];
13093 puDst->au64[2] = uDst.au64[2];
13094 puDst->au64[3] = uDst.au64[3];
13095}
13096
13097
13098/*
13099 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
13100 */
13101IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13102{
13103 RTUINT64U uSrc1 = { *puDst };
13104 RTUINT64U uSrc2 = { *puSrc };
13105 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13106
13107 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13108 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13109 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
13110 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
13111 *puDst = uDst.u;
13112 RT_NOREF(pFpuState);
13113}
13114
13115
13116IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13117{
13118 RTUINT128U uSrc1 = *puDst;
13119
13120 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13121 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13122 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13123 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13124
13125 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13126 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13127 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13128 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13129 RT_NOREF(pFpuState);
13130}
13131
13132
13133IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13134{
13135 RTUINT64U uSrc1 = { *puDst };
13136 RTUINT64U uSrc2 = { *puSrc };
13137 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13138
13139 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13140 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13141 *puDst = uDst.u;
13142 RT_NOREF(pFpuState);
13143}
13144
13145
13146IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13147{
13148 RTUINT128U uSrc1 = *puDst;
13149
13150 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13151 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13152
13153 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13154 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13155 RT_NOREF(pFpuState);
13156}
13157
13158
13159IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13160{
13161 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13162
13163 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13164 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13165 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13166 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13167
13168 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13169 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13170 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13171 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13172
13173 puDst->au64[0] = uDst.au64[0];
13174 puDst->au64[1] = uDst.au64[1];
13175}
13176
13177
13178IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13179{
13180 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13181
13182 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13183 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13184 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13185 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13186 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13187 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13188 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13189 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13190
13191 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13192 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13193 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13194 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13195 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13196 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13197 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13198 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13199
13200 puDst->au64[0] = uDst.au64[0];
13201 puDst->au64[1] = uDst.au64[1];
13202 puDst->au64[2] = uDst.au64[2];
13203 puDst->au64[3] = uDst.au64[3];
13204}
13205
13206
13207IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13208{
13209 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13210
13211 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13212 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13213
13214 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13215 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13216
13217 puDst->au64[0] = uDst.au64[0];
13218 puDst->au64[1] = uDst.au64[1];
13219}
13220
13221
13222IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13223{
13224 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13225
13226 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
13227 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
13228 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
13229 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
13230
13231 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
13232 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
13233 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
13234 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
13235
13236 puDst->au64[0] = uDst.au64[0];
13237 puDst->au64[1] = uDst.au64[1];
13238 puDst->au64[2] = uDst.au64[2];
13239 puDst->au64[3] = uDst.au64[3];
13240}
13241
13242
13243/*
13244 * PHADDSW / VPHADDSW
13245 */
13246IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13247{
13248 RTUINT64U uSrc1 = { *puDst };
13249 RTUINT64U uSrc2 = { *puSrc };
13250 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13251
13252 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13253 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13254 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
13255 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
13256 *puDst = uDst.u;
13257 RT_NOREF(pFpuState);
13258}
13259
13260
13261IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13262{
13263 RTUINT128U uSrc1 = *puDst;
13264
13265 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13266 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13267 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
13268 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
13269
13270 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
13271 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
13272 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
13273 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
13274 RT_NOREF(pFpuState);
13275}
13276
13277
13278IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13279{
13280 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13281
13282 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
13283 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
13284 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
13285 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
13286
13287 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
13288 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
13289 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
13290 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
13291
13292 puDst->au64[0] = uDst.au64[0];
13293 puDst->au64[1] = uDst.au64[1];
13294}
13295
13296
13297IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13298{
13299 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13300
13301 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
13302 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
13303 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
13304 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
13305 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
13306 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
13307 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
13308 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
13309
13310 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
13311 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
13312 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
13313 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
13314 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
13315 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
13316 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
13317 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
13318
13319 puDst->au64[0] = uDst.au64[0];
13320 puDst->au64[1] = uDst.au64[1];
13321 puDst->au64[2] = uDst.au64[2];
13322 puDst->au64[3] = uDst.au64[3];
13323}
13324
13325
13326/*
13327 * PHSUBSW / VPHSUBSW
13328 */
13329IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13330{
13331 RTUINT64U uSrc1 = { *puDst };
13332 RTUINT64U uSrc2 = { *puSrc };
13333 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13334
13335 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13336 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13337 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
13338 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
13339 *puDst = uDst.u;
13340 RT_NOREF(pFpuState);
13341}
13342
13343
13344IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13345{
13346 RTUINT128U uSrc1 = *puDst;
13347
13348 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13349 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13350 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
13351 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
13352
13353 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
13354 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
13355 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
13356 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
13357 RT_NOREF(pFpuState);
13358}
13359
13360
13361IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13362{
13363 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13364
13365 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
13366 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
13367 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
13368 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
13369
13370 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
13371 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
13372 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
13373 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
13374
13375 puDst->au64[0] = uDst.au64[0];
13376 puDst->au64[1] = uDst.au64[1];
13377}
13378
13379
13380IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13381{
13382 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13383
13384 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
13385 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
13386 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
13387 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
13388 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
13389 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
13390 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
13391 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
13392
13393 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
13394 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
13395 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
13396 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
13397 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
13398 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
13399 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
13400 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
13401
13402 puDst->au64[0] = uDst.au64[0];
13403 puDst->au64[1] = uDst.au64[1];
13404 puDst->au64[2] = uDst.au64[2];
13405 puDst->au64[3] = uDst.au64[3];
13406}
13407
13408
13409/*
13410 * PMADDUBSW / VPMADDUBSW
13411 */
13412IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13413{
13414 RTUINT64U uSrc1 = { *puDst };
13415 RTUINT64U uSrc2 = { *puSrc };
13416 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13417
13418 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
13419 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
13420 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
13421 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
13422 *puDst = uDst.u;
13423 RT_NOREF(pFpuState);
13424}
13425
13426
13427IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13428{
13429 RTUINT128U uSrc1 = *puDst;
13430
13431 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
13432 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
13433 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
13434 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
13435 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
13436 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
13437 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
13438 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
13439 RT_NOREF(pFpuState);
13440}
13441
13442
13443IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13444{
13445 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13446
13447 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13448 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13449 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13450 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13451 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13452 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13453 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13454 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13455
13456 puDst->au64[0] = uDst.au64[0];
13457 puDst->au64[1] = uDst.au64[1];
13458}
13459
13460
13461IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13462{
13463 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13464
13465 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13466 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13467 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13468 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13469 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13470 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13471 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13472 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13473 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
13474 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
13475 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
13476 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
13477 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
13478 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
13479 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
13480 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
13481
13482 puDst->au64[0] = uDst.au64[0];
13483 puDst->au64[1] = uDst.au64[1];
13484 puDst->au64[2] = uDst.au64[2];
13485 puDst->au64[3] = uDst.au64[3];
13486}
13487
13488
13489/*
13490 * PMULHRSW / VPMULHRSW
13491 */
/**
 * One PMULHRSW element: multiplies the two operands as 32-bit signed values,
 * shifts the product right by 14, adds 1, shifts right by one more bit and
 * truncates the result to 16 bits.
 */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ( (uint16_t)( ( ( ((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1 ) >> 1 ) )
13494
13495IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13496{
13497 RTUINT64U uSrc1 = { *puDst };
13498 RTUINT64U uSrc2 = { *puSrc };
13499 RTUINT64U uDst;
13500
13501 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
13502 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
13503 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
13504 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
13505 *puDst = uDst.u;
13506 RT_NOREF(pFpuState);
13507}
13508
13509
13510IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13511{
13512 RTUINT128U uSrc1 = *puDst;
13513
13514 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
13515 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
13516 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
13517 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
13518 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
13519 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
13520 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
13521 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
13522 RT_NOREF(pFpuState);
13523}
13524
13525
13526IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13527{
13528 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13529
13530 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
13531 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
13532 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
13533 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
13534 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
13535 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
13536 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
13537 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
13538
13539 puDst->au64[0] = uDst.au64[0];
13540 puDst->au64[1] = uDst.au64[1];
13541}
13542
13543
13544IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13545{
13546 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13547
13548 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13549 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13550 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13551 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13552 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13553 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13554 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13555 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13556 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13557 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13558 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13559 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13560 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13561 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
13562 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
13563 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
13564
13565 puDst->au64[0] = uDst.au64[0];
13566 puDst->au64[1] = uDst.au64[1];
13567 puDst->au64[2] = uDst.au64[2];
13568 puDst->au64[3] = uDst.au64[3];
13569}
13570
13571
13572/*
13573 * PSADBW / VPSADBW
13574 */
13575#ifdef IEM_WITHOUT_ASSEMBLY
13576
13577IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13578{
13579 RTUINT64U uSrc1 = { *puDst };
13580 RTUINT64U uSrc2 = { *puSrc };
13581 RTUINT64U uDst;
13582 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13583 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13584 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13585 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13586 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13587 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13588 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13589 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13590
13591 uDst.au64[0] = 0;
13592 uDst.au16[0] = uSum;
13593 *puDst = uDst.u;
13594}
13595
13596
13597IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13598{
13599 RTUINT128U uSrc1 = *puDst;
13600
13601 puDst->au64[0] = 0;
13602 puDst->au64[1] = 0;
13603
13604 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
13605 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
13606 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
13607 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
13608 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
13609 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
13610 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
13611 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
13612 puDst->au16[0] = uSum;
13613
13614 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
13615 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
13616 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
13617 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
13618 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
13619 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
13620 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
13621 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
13622 puDst->au16[4] = uSum;
13623}
13624
13625#endif
13626
13627IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13628{
13629 RTUINT128U uSrc1 = *puSrc1;
13630 RTUINT128U uSrc2 = *puSrc2;
13631
13632 puDst->au64[0] = 0;
13633 puDst->au64[1] = 0;
13634
13635 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
13636 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13637 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13638 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13639 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13640 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13641 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13642 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13643 puDst->au16[0] = uSum;
13644
13645 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13646 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13647 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13648 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13649 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13650 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13651 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13652 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13653 puDst->au16[4] = uSum;
13654}
13655
13656IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13657{
13658 RTUINT256U uSrc1 = *puSrc1;
13659 RTUINT256U uSrc2 = *puSrc2;
13660
13661 puDst->au64[0] = 0;
13662 puDst->au64[1] = 0;
13663 puDst->au64[2] = 0;
13664 puDst->au64[3] = 0;
13665
13666 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13667 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13668 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13669 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13670 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13671 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13672 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13673 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13674 puDst->au16[0] = uSum;
13675
13676 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13677 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13678 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13679 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13680 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13681 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13682 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13683 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13684 puDst->au16[4] = uSum;
13685
13686 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
13687 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
13688 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
13689 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
13690 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
13691 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
13692 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
13693 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
13694 puDst->au16[8] = uSum;
13695
13696 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
13697 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
13698 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
13699 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
13700 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
13701 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
13702 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
13703 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
13704 puDst->au16[12] = uSum;
13705}
13706
13707
13708/*
13709 * PMULDQ / VPMULDQ
13710 */
13711IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13712{
13713 RTUINT128U uSrc1 = *puDst;
13714
13715 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
13716 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
13717}
13718
13719IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13720{
13721 RTUINT128U uSrc1 = *puSrc1;
13722 RTUINT128U uSrc2 = *puSrc2;
13723
13724 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13725 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13726}
13727
13728IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13729{
13730 RTUINT256U uSrc1 = *puSrc1;
13731 RTUINT256U uSrc2 = *puSrc2;
13732
13733 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13734 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13735 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
13736 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
13737}
13738
13739
13740/*
13741 * PMULUDQ / VPMULUDQ
13742 */
13743#ifdef IEM_WITHOUT_ASSEMBLY
13744
13745IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13746{
13747 RTUINT64U uSrc1 = { *puDst };
13748 RTUINT64U uSrc2 = { *puSrc };
13749 ASMCompilerBarrier();
13750 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13751 RT_NOREF(pFpuState);
13752}
13753
13754
13755IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13756{
13757 RTUINT128U uSrc1 = *puDst;
13758 RTUINT128U uSrc2 = *puSrc;
13759 ASMCompilerBarrier();
13760 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13761 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13762 RT_NOREF(pFpuState);
13763}
13764
13765#endif
13766
13767IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13768{
13769 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13770 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13771 ASMCompilerBarrier();
13772 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13773 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13774}
13775
13776
13777IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13778{
13779 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13780 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13781 ASMCompilerBarrier();
13782 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13783 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13784 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
13785 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
13786}
13787
13788
13789/*
13790 * UNPCKLPS / VUNPCKLPS
13791 */
13792#ifdef IEM_WITHOUT_ASSEMBLY
13793IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13794{
13795 RTUINT128U uSrc1 = *puDst;
13796 RTUINT128U uSrc2 = *puSrc;
13797 ASMCompilerBarrier();
13798 puDst->au32[0] = uSrc1.au32[0];
13799 puDst->au32[1] = uSrc2.au32[0];
13800 puDst->au32[2] = uSrc1.au32[1];
13801 puDst->au32[3] = uSrc2.au32[1];
13802}
13803
13804#endif
13805
13806IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13807{
13808 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13809 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13810 ASMCompilerBarrier();
13811 puDst->au32[0] = uSrc1.au32[0];
13812 puDst->au32[1] = uSrc2.au32[0];
13813 puDst->au32[2] = uSrc1.au32[1];
13814 puDst->au32[3] = uSrc2.au32[1];
13815}
13816
13817
13818IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13819{
13820 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13821 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13822 ASMCompilerBarrier();
13823 puDst->au32[0] = uSrc1.au32[0];
13824 puDst->au32[1] = uSrc2.au32[0];
13825 puDst->au32[2] = uSrc1.au32[1];
13826 puDst->au32[3] = uSrc2.au32[1];
13827
13828 puDst->au32[4] = uSrc1.au32[4];
13829 puDst->au32[5] = uSrc2.au32[4];
13830 puDst->au32[6] = uSrc1.au32[5];
13831 puDst->au32[7] = uSrc2.au32[5];
13832}
13833
13834
13835/*
13836 * UNPCKLPD / VUNPCKLPD
13837 */
13838#ifdef IEM_WITHOUT_ASSEMBLY
13839IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13840{
13841 RTUINT128U uSrc1 = *puDst;
13842 RTUINT128U uSrc2 = *puSrc;
13843 ASMCompilerBarrier();
13844 puDst->au64[0] = uSrc1.au64[0];
13845 puDst->au64[1] = uSrc2.au64[0];
13846}
13847
13848#endif
13849
13850IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13851{
13852 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13853 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13854 ASMCompilerBarrier();
13855 puDst->au64[0] = uSrc1.au64[0];
13856 puDst->au64[1] = uSrc2.au64[0];
13857}
13858
13859
13860IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13861{
13862 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13863 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13864 ASMCompilerBarrier();
13865 puDst->au64[0] = uSrc1.au64[0];
13866 puDst->au64[1] = uSrc2.au64[0];
13867 puDst->au64[2] = uSrc1.au64[2];
13868 puDst->au64[3] = uSrc2.au64[2];
13869}
13870
13871
13872/*
13873 * UNPCKHPS / VUNPCKHPS
13874 */
13875#ifdef IEM_WITHOUT_ASSEMBLY
13876IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13877{
13878 RTUINT128U uSrc1 = *puDst;
13879 RTUINT128U uSrc2 = *puSrc;
13880 ASMCompilerBarrier();
13881 puDst->au32[0] = uSrc1.au32[2];
13882 puDst->au32[1] = uSrc2.au32[2];
13883 puDst->au32[2] = uSrc1.au32[3];
13884 puDst->au32[3] = uSrc2.au32[3];
13885}
13886
13887#endif
13888
13889IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13890{
13891 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13892 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13893 ASMCompilerBarrier();
13894 puDst->au32[0] = uSrc1.au32[2];
13895 puDst->au32[1] = uSrc2.au32[2];
13896 puDst->au32[2] = uSrc1.au32[3];
13897 puDst->au32[3] = uSrc2.au32[3];
13898}
13899
13900
13901IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13902{
13903 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13904 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13905 ASMCompilerBarrier();
13906 puDst->au32[0] = uSrc1.au32[2];
13907 puDst->au32[1] = uSrc2.au32[2];
13908 puDst->au32[2] = uSrc1.au32[3];
13909 puDst->au32[3] = uSrc2.au32[3];
13910
13911 puDst->au32[4] = uSrc1.au32[6];
13912 puDst->au32[5] = uSrc2.au32[6];
13913 puDst->au32[6] = uSrc1.au32[7];
13914 puDst->au32[7] = uSrc2.au32[7];
13915}
13916
13917
13918/*
13919 * UNPCKHPD / VUNPCKHPD
13920 */
13921#ifdef IEM_WITHOUT_ASSEMBLY
13922IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13923{
13924 RTUINT128U uSrc1 = *puDst;
13925 RTUINT128U uSrc2 = *puSrc;
13926 ASMCompilerBarrier();
13927 puDst->au64[0] = uSrc1.au64[1];
13928 puDst->au64[1] = uSrc2.au64[1];
13929}
13930
13931#endif
13932
13933IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13934{
13935 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13936 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13937 ASMCompilerBarrier();
13938 puDst->au64[0] = uSrc1.au64[1];
13939 puDst->au64[1] = uSrc2.au64[1];
13940}
13941
13942
13943IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13944{
13945 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13946 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13947 ASMCompilerBarrier();
13948 puDst->au64[0] = uSrc1.au64[1];
13949 puDst->au64[1] = uSrc2.au64[1];
13950 puDst->au64[2] = uSrc1.au64[3];
13951 puDst->au64[3] = uSrc2.au64[3];
13952}
13953
13954
/*
 * CRC32 (SSE 4.2).
 */
13958
13959IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
13960{
13961 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13962}
13963
13964
13965IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
13966{
13967 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13968}
13969
13970IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
13971{
13972 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13973}
13974
13975IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
13976{
13977 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13978}
13979
13980
/*
 * PTEST (SSE 4.1) - special as it only outputs EFLAGS.
 */
13984#ifdef IEM_WITHOUT_ASSEMBLY
13985IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
13986{
13987 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
13988 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
13989 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
13990 fEfl |= X86_EFL_ZF;
13991 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
13992 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
13993 fEfl |= X86_EFL_CF;
13994 *pfEFlags = fEfl;
13995}
13996#endif
13997
13998IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
13999{
14000 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14001 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14002 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
14003 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
14004 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14005 fEfl |= X86_EFL_ZF;
14006 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14007 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
14008 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
14009 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14010 fEfl |= X86_EFL_CF;
14011 *pfEFlags = fEfl;
14012}
14013
14014
14015/*
14016 * PMOVSXBW / VPMOVSXBW
14017 */
14018IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14019{
14020 RTUINT64U uSrc1 = { uSrc };
14021 puDst->ai16[0] = uSrc1.ai8[0];
14022 puDst->ai16[1] = uSrc1.ai8[1];
14023 puDst->ai16[2] = uSrc1.ai8[2];
14024 puDst->ai16[3] = uSrc1.ai8[3];
14025 puDst->ai16[4] = uSrc1.ai8[4];
14026 puDst->ai16[5] = uSrc1.ai8[5];
14027 puDst->ai16[6] = uSrc1.ai8[6];
14028 puDst->ai16[7] = uSrc1.ai8[7];
14029}
14030
14031
14032IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14033{
14034 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14035 puDst->ai16[ 0] = uSrc1.ai8[ 0];
14036 puDst->ai16[ 1] = uSrc1.ai8[ 1];
14037 puDst->ai16[ 2] = uSrc1.ai8[ 2];
14038 puDst->ai16[ 3] = uSrc1.ai8[ 3];
14039 puDst->ai16[ 4] = uSrc1.ai8[ 4];
14040 puDst->ai16[ 5] = uSrc1.ai8[ 5];
14041 puDst->ai16[ 6] = uSrc1.ai8[ 6];
14042 puDst->ai16[ 7] = uSrc1.ai8[ 7];
14043 puDst->ai16[ 8] = uSrc1.ai8[ 8];
14044 puDst->ai16[ 9] = uSrc1.ai8[ 9];
14045 puDst->ai16[10] = uSrc1.ai8[10];
14046 puDst->ai16[11] = uSrc1.ai8[11];
14047 puDst->ai16[12] = uSrc1.ai8[12];
14048 puDst->ai16[13] = uSrc1.ai8[13];
14049 puDst->ai16[14] = uSrc1.ai8[14];
14050 puDst->ai16[15] = uSrc1.ai8[15];
14051}
14052
14053
14054/*
14055 * PMOVSXBD / VPMOVSXBD
14056 */
14057IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14058{
14059 RTUINT32U uSrc1 = { uSrc };
14060 puDst->ai32[0] = uSrc1.ai8[0];
14061 puDst->ai32[1] = uSrc1.ai8[1];
14062 puDst->ai32[2] = uSrc1.ai8[2];
14063 puDst->ai32[3] = uSrc1.ai8[3];
14064}
14065
14066
14067IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14068{
14069 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14070 puDst->ai32[0] = uSrc1.ai8[0];
14071 puDst->ai32[1] = uSrc1.ai8[1];
14072 puDst->ai32[2] = uSrc1.ai8[2];
14073 puDst->ai32[3] = uSrc1.ai8[3];
14074 puDst->ai32[4] = uSrc1.ai8[4];
14075 puDst->ai32[5] = uSrc1.ai8[5];
14076 puDst->ai32[6] = uSrc1.ai8[6];
14077 puDst->ai32[7] = uSrc1.ai8[7];
14078}
14079
14080
14081/*
14082 * PMOVSXBQ / VPMOVSXBQ
14083 */
14084IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14085{
14086 RTUINT16U uSrc1 = { uSrc };
14087 puDst->ai64[0] = uSrc1.ai8[0];
14088 puDst->ai64[1] = uSrc1.ai8[1];
14089}
14090
14091
14092IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14093{
14094 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14095 puDst->ai64[0] = uSrc1.ai8[0];
14096 puDst->ai64[1] = uSrc1.ai8[1];
14097 puDst->ai64[2] = uSrc1.ai8[2];
14098 puDst->ai64[3] = uSrc1.ai8[3];
14099}
14100
14101
14102/*
14103 * PMOVSXWD / VPMOVSXWD
14104 */
14105IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14106{
14107 RTUINT64U uSrc1 = { uSrc };
14108 puDst->ai32[0] = uSrc1.ai16[0];
14109 puDst->ai32[1] = uSrc1.ai16[1];
14110 puDst->ai32[2] = uSrc1.ai16[2];
14111 puDst->ai32[3] = uSrc1.ai16[3];
14112}
14113
14114
14115IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14116{
14117 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14118 puDst->ai32[0] = uSrc1.ai16[0];
14119 puDst->ai32[1] = uSrc1.ai16[1];
14120 puDst->ai32[2] = uSrc1.ai16[2];
14121 puDst->ai32[3] = uSrc1.ai16[3];
14122 puDst->ai32[4] = uSrc1.ai16[4];
14123 puDst->ai32[5] = uSrc1.ai16[5];
14124 puDst->ai32[6] = uSrc1.ai16[6];
14125 puDst->ai32[7] = uSrc1.ai16[7];
14126}
14127
14128
14129/*
14130 * PMOVSXWQ / VPMOVSXWQ
14131 */
14132IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14133{
14134 RTUINT32U uSrc1 = { uSrc };
14135 puDst->ai64[0] = uSrc1.ai16[0];
14136 puDst->ai64[1] = uSrc1.ai16[1];
14137}
14138
14139
14140IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14141{
14142 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14143 puDst->ai64[0] = uSrc1.ai16[0];
14144 puDst->ai64[1] = uSrc1.ai16[1];
14145 puDst->ai64[2] = uSrc1.ai16[2];
14146 puDst->ai64[3] = uSrc1.ai16[3];
14147}
14148
14149
14150/*
14151 * PMOVSXDQ / VPMOVSXDQ
14152 */
14153IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14154{
14155 RTUINT64U uSrc1 = { uSrc };
14156 puDst->ai64[0] = uSrc1.ai32[0];
14157 puDst->ai64[1] = uSrc1.ai32[1];
14158}
14159
14160
14161IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14162{
14163 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14164 puDst->ai64[0] = uSrc1.ai32[0];
14165 puDst->ai64[1] = uSrc1.ai32[1];
14166 puDst->ai64[2] = uSrc1.ai32[2];
14167 puDst->ai64[3] = uSrc1.ai32[3];
14168}
14169
14170
14171/*
14172 * PMOVZXBW / VPMOVZXBW
14173 */
14174IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14175{
14176 RTUINT64U uSrc1 = { uSrc };
14177 puDst->au16[0] = uSrc1.au8[0];
14178 puDst->au16[1] = uSrc1.au8[1];
14179 puDst->au16[2] = uSrc1.au8[2];
14180 puDst->au16[3] = uSrc1.au8[3];
14181 puDst->au16[4] = uSrc1.au8[4];
14182 puDst->au16[5] = uSrc1.au8[5];
14183 puDst->au16[6] = uSrc1.au8[6];
14184 puDst->au16[7] = uSrc1.au8[7];
14185}
14186
14187
14188IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14189{
14190 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14191 puDst->au16[ 0] = uSrc1.au8[ 0];
14192 puDst->au16[ 1] = uSrc1.au8[ 1];
14193 puDst->au16[ 2] = uSrc1.au8[ 2];
14194 puDst->au16[ 3] = uSrc1.au8[ 3];
14195 puDst->au16[ 4] = uSrc1.au8[ 4];
14196 puDst->au16[ 5] = uSrc1.au8[ 5];
14197 puDst->au16[ 6] = uSrc1.au8[ 6];
14198 puDst->au16[ 7] = uSrc1.au8[ 7];
14199 puDst->au16[ 8] = uSrc1.au8[ 8];
14200 puDst->au16[ 9] = uSrc1.au8[ 9];
14201 puDst->au16[10] = uSrc1.au8[10];
14202 puDst->au16[11] = uSrc1.au8[11];
14203 puDst->au16[12] = uSrc1.au8[12];
14204 puDst->au16[13] = uSrc1.au8[13];
14205 puDst->au16[14] = uSrc1.au8[14];
14206 puDst->au16[15] = uSrc1.au8[15];
14207}
14208
14209
14210/*
14211 * PMOVZXBD / VPMOVZXBD
14212 */
14213IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14214{
14215 RTUINT32U uSrc1 = { uSrc };
14216 puDst->au32[0] = uSrc1.au8[0];
14217 puDst->au32[1] = uSrc1.au8[1];
14218 puDst->au32[2] = uSrc1.au8[2];
14219 puDst->au32[3] = uSrc1.au8[3];
14220}
14221
14222
14223IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14224{
14225 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14226 puDst->au32[0] = uSrc1.au8[0];
14227 puDst->au32[1] = uSrc1.au8[1];
14228 puDst->au32[2] = uSrc1.au8[2];
14229 puDst->au32[3] = uSrc1.au8[3];
14230 puDst->au32[4] = uSrc1.au8[4];
14231 puDst->au32[5] = uSrc1.au8[5];
14232 puDst->au32[6] = uSrc1.au8[6];
14233 puDst->au32[7] = uSrc1.au8[7];
14234}
14235
14236
14237/*
14238 * PMOVZXBQ / VPMOVZXBQ
14239 */
14240IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14241{
14242 RTUINT16U uSrc1 = { uSrc };
14243 puDst->au64[0] = uSrc1.au8[0];
14244 puDst->au64[1] = uSrc1.au8[1];
14245}
14246
14247
14248IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14249{
14250 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14251 puDst->au64[0] = uSrc1.au8[0];
14252 puDst->au64[1] = uSrc1.au8[1];
14253 puDst->au64[2] = uSrc1.au8[2];
14254 puDst->au64[3] = uSrc1.au8[3];
14255}
14256
14257
14258/*
14259 * PMOVZXWD / VPMOVZXWD
14260 */
14261IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14262{
14263 RTUINT64U uSrc1 = { uSrc };
14264 puDst->au32[0] = uSrc1.au16[0];
14265 puDst->au32[1] = uSrc1.au16[1];
14266 puDst->au32[2] = uSrc1.au16[2];
14267 puDst->au32[3] = uSrc1.au16[3];
14268}
14269
14270
14271IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14272{
14273 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14274 puDst->au32[0] = uSrc1.au16[0];
14275 puDst->au32[1] = uSrc1.au16[1];
14276 puDst->au32[2] = uSrc1.au16[2];
14277 puDst->au32[3] = uSrc1.au16[3];
14278 puDst->au32[4] = uSrc1.au16[4];
14279 puDst->au32[5] = uSrc1.au16[5];
14280 puDst->au32[6] = uSrc1.au16[6];
14281 puDst->au32[7] = uSrc1.au16[7];
14282}
14283
14284
14285/*
14286 * PMOVZXWQ / VPMOVZXWQ
14287 */
14288IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14289{
14290 RTUINT32U uSrc1 = { uSrc };
14291 puDst->au64[0] = uSrc1.au16[0];
14292 puDst->au64[1] = uSrc1.au16[1];
14293}
14294
14295
14296IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14297{
14298 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14299 puDst->au64[0] = uSrc1.au16[0];
14300 puDst->au64[1] = uSrc1.au16[1];
14301 puDst->au64[2] = uSrc1.au16[2];
14302 puDst->au64[3] = uSrc1.au16[3];
14303}
14304
14305
14306/*
14307 * PMOVZXDQ / VPMOVZXDQ
14308 */
14309IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14310{
14311 RTUINT64U uSrc1 = { uSrc };
14312 puDst->au64[0] = uSrc1.au32[0];
14313 puDst->au64[1] = uSrc1.au32[1];
14314}
14315
14316
14317IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14318{
14319 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14320 puDst->au64[0] = uSrc1.au32[0];
14321 puDst->au64[1] = uSrc1.au32[1];
14322 puDst->au64[2] = uSrc1.au32[2];
14323 puDst->au64[3] = uSrc1.au32[3];
14324}
14325
14326
14327#ifdef IEM_WITHOUT_ASSEMBLY
14328/**
14329 * Converts from the packed IPRT 32-bit (single precision) floating point format to
14330 * the SoftFloat 32-bit floating point format (float32_t).
14331 *
14332 * This is only a structure format conversion, nothing else.
14333 */
14334DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
14335{
14336 float32_t Tmp;
14337 Tmp.v = pr32Val->u;
14338 return Tmp;
14339}
14340
14341
14342/**
14343 * Converts from SoftFloat 32-bit floating point format (float32_t)
14344 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
14345 *
14346 * This is only a structure format conversion, nothing else.
14347 */
14348DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
14349{
14350 pr32Dst->u = r32XSrc.v;
14351 return pr32Dst;
14352}
14353
14354
14355/**
14356 * Converts from the packed IPRT 64-bit (single precision) floating point format to
14357 * the SoftFloat 64-bit floating point format (float64_t).
14358 *
14359 * This is only a structure format conversion, nothing else.
14360 */
14361DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
14362{
14363 float64_t Tmp;
14364 Tmp.v = pr64Val->u;
14365 return Tmp;
14366}
14367
14368
14369/**
14370 * Converts from SoftFloat 64-bit floating point format (float64_t)
14371 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
14372 *
14373 * This is only a structure format conversion, nothing else.
14374 */
14375DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
14376{
14377 pr64Dst->u = r64XSrc.v;
14378 return pr64Dst;
14379}
14380
14381
/**
 * Brace initializer for a SoftFloat state structure derived from an MXCSR value.
 *
 * The rounding mode is picked from the X86_MXCSR_RC_MASK field and the
 * exception mask bits are shifted down by X86_MXCSR_XCPT_MASK_SHIFT; the
 * remaining members are constants.
 *
 * @param   a_Mxcsr     The MXCSR value (evaluated several times).
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        /* Tininess detection: */ \
        softfloat_tininess_afterRounding, \
        /* Rounding mode; anything not matched explicitly ends up as round_minMag: */ \
          ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        :                                                           (uint8_t)softfloat_round_minMag, \
        /* Starts out as zero (presumably the exception flags - confirm against softfloat_state_t): */ \
        0, \
        /* Exception mask bits, matches X86_FSW_?E: */ \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), \
        /* Rounding precision, not relevant for SIMD: */ \
        32 \
    }
14394
14395
14396/**
14397 * Helper for transfering exception to MXCSR and setting the result value
14398 * accordingly.
14399 *
14400 * @returns Updated MXCSR.
14401 * @param pSoftState The SoftFloat state following the operation.
14402 * @param r32Result The result of the SoftFloat operation.
14403 * @param pr32Result Where to store the result for IEM.
14404 * @param fMxcsr The original MXCSR value.
14405 */
14406DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
14407 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14408{
14409 iemFpSoftF32ToIprt(pr32Result, r32Result);
14410
14411 uint8_t fXcpt = pSoftState->exceptionFlags;
14412 if ( (fMxcsr & X86_MXCSR_FZ)
14413 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
14414 {
14415 /* Underflow masked and flush to zero is set. */
14416 pr32Result->s.uFraction = 0;
14417 pr32Result->s.uExponent = 0;
14418 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14419 }
14420
14421 /* If DAZ is set \#DE is never set. */
14422 if ( fMxcsr & X86_MXCSR_DAZ
14423 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14424 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14425 fXcpt &= ~X86_MXCSR_DE;
14426
14427 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14428}
14429
14430
14431/**
14432 * Helper for transfering exception to MXCSR and setting the result value
14433 * accordingly - ignores Flush-to-Zero.
14434 *
14435 * @returns Updated MXCSR.
14436 * @param pSoftState The SoftFloat state following the operation.
14437 * @param r32Result The result of the SoftFloat operation.
14438 * @param pr32Result Where to store the result for IEM.
14439 * @param fMxcsr The original MXCSR value.
14440 */
14441DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
14442 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14443{
14444 iemFpSoftF32ToIprt(pr32Result, r32Result);
14445
14446 uint8_t fXcpt = pSoftState->exceptionFlags;
14447 /* If DAZ is set \#DE is never set. */
14448 if ( fMxcsr & X86_MXCSR_DAZ
14449 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14450 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14451 fXcpt &= ~X86_MXCSR_DE;
14452
14453 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14454}
14455
14456
14457/**
14458 * Helper for transfering exception to MXCSR and setting the result value
14459 * accordingly.
14460 *
14461 * @returns Updated MXCSR.
14462 * @param pSoftState The SoftFloat state following the operation.
14463 * @param r64Result The result of the SoftFloat operation.
14464 * @param pr64Result Where to store the result for IEM.
14465 * @param fMxcsr The original MXCSR value.
14466 */
14467DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
14468 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14469{
14470 iemFpSoftF64ToIprt(pr64Result, r64Result);
14471 uint8_t fXcpt = pSoftState->exceptionFlags;
14472 if ( (fMxcsr & X86_MXCSR_FZ)
14473 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
14474 {
14475 /* Underflow masked and flush to zero is set. */
14476 iemFpSoftF64ToIprt(pr64Result, r64Result);
14477 pr64Result->s.uFractionHigh = 0;
14478 pr64Result->s.uFractionLow = 0;
14479 pr64Result->s.uExponent = 0;
14480 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14481 }
14482
14483 /* If DAZ is set \#DE is never set. */
14484 if ( fMxcsr & X86_MXCSR_DAZ
14485 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14486 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14487 fXcpt &= ~X86_MXCSR_DE;
14488
14489 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14490}
14491
14492
14493/**
14494 * Helper for transfering exception to MXCSR and setting the result value
14495 * accordingly - ignores Flush-to-Zero.
14496 *
14497 * @returns Updated MXCSR.
14498 * @param pSoftState The SoftFloat state following the operation.
14499 * @param r64Result The result of the SoftFloat operation.
14500 * @param pr64Result Where to store the result for IEM.
14501 * @param fMxcsr The original MXCSR value.
14502 */
14503DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
14504 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14505{
14506 iemFpSoftF64ToIprt(pr64Result, r64Result);
14507
14508 uint8_t fXcpt = pSoftState->exceptionFlags;
14509 /* If DAZ is set \#DE is never set. */
14510 if ( fMxcsr & X86_MXCSR_DAZ
14511 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14512 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14513 fXcpt &= ~X86_MXCSR_DE;
14514
14515 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14516}
14517
14518
14519/**
14520 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
14521 * in MXCSR into account.
14522 *
14523 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14524 * @param pr32Val Where to store the result.
14525 * @param fMxcsr The input MXCSR value.
14526 * @param pr32Src The value to use.
14527 */
14528DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
14529{
14530 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
14531 {
14532 if (fMxcsr & X86_MXCSR_DAZ)
14533 {
14534 /* De-normals are changed to 0. */
14535 pr32Val->s.fSign = pr32Src->s.fSign;
14536 pr32Val->s.uFraction = 0;
14537 pr32Val->s.uExponent = 0;
14538 return 0;
14539 }
14540
14541 *pr32Val = *pr32Src;
14542 return X86_MXCSR_DE;
14543 }
14544
14545 *pr32Val = *pr32Src;
14546 return 0;
14547}
14548
14549
14550/**
14551 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
14552 * in MXCSR into account.
14553 *
14554 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14555 * @param pr64Val Where to store the result.
14556 * @param fMxcsr The input MXCSR value.
14557 * @param pr64Src The value to use.
14558 */
14559DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
14560{
14561 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
14562 {
14563 if (fMxcsr & X86_MXCSR_DAZ)
14564 {
14565 /* De-normals are changed to 0. */
14566 pr64Val->s64.fSign = pr64Src->s.fSign;
14567 pr64Val->s64.uFraction = 0;
14568 pr64Val->s64.uExponent = 0;
14569 return 0;
14570 }
14571
14572 *pr64Val = *pr64Src;
14573 return X86_MXCSR_DE;
14574 }
14575
14576 *pr64Val = *pr64Src;
14577 return 0;
14578}
14579
14580
14581/**
14582 * Validates the given input operands returning whether the operation can continue or whether one
14583 * of the source operands contains a NaN value, setting the output accordingly.
14584 *
14585 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14586 * @param pr32Res Where to store the result in case the operation can't continue.
14587 * @param pr32Val1 The first input operand.
14588 * @param pr32Val2 The second input operand.
14589 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14590 */
14591DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
14592{
14593 uint8_t cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
14594 uint8_t cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
14595 if (cSNan + cQNan == 2)
14596 {
14597 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14598 *pr32Res = *pr32Val1;
14599 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14600 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14601 return true;
14602 }
14603 else if (cSNan)
14604 {
14605 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14606 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14607 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14608 *pfMxcsr |= X86_MXCSR_IE;
14609 return true;
14610 }
14611 else if (cQNan)
14612 {
14613 /* The QNan operand is placed into the result. */
14614 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14615 return true;
14616 }
14617
14618 Assert(!cQNan && !cSNan);
14619 return false;
14620}
14621
14622
14623/**
14624 * Validates the given double precision input operands returning whether the operation can continue or whether one
14625 * of the source operands contains a NaN value, setting the output accordingly.
14626 *
14627 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14628 * @param pr64Res Where to store the result in case the operation can't continue.
14629 * @param pr64Val1 The first input operand.
14630 * @param pr64Val2 The second input operand.
14631 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14632 */
14633DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
14634{
14635 uint8_t cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
14636 uint8_t cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
14637 if (cSNan + cQNan == 2)
14638 {
14639 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14640 *pr64Res = *pr64Val1;
14641 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14642 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14643 return true;
14644 }
14645 else if (cSNan)
14646 {
14647 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14648 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14649 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14650 *pfMxcsr |= X86_MXCSR_IE;
14651 return true;
14652 }
14653 else if (cQNan)
14654 {
14655 /* The QNan operand is placed into the result. */
14656 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14657 return true;
14658 }
14659
14660 Assert(!cQNan && !cSNan);
14661 return false;
14662}
14663
14664
14665/**
14666 * Validates the given single input operand returning whether the operation can continue or whether
14667 * contains a NaN value, setting the output accordingly.
14668 *
14669 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14670 * @param pr32Res Where to store the result in case the operation can't continue.
14671 * @param pr32Val The input operand.
14672 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14673 */
14674DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
14675{
14676 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
14677 {
14678 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14679 *pr32Res = *pr32Val;
14680 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14681 *pfMxcsr |= X86_MXCSR_IE;
14682 return true;
14683 }
14684 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
14685 {
14686 /* The QNan operand is placed into the result. */
14687 *pr32Res = *pr32Val;
14688 return true;
14689 }
14690
14691 return false;
14692}
14693
14694
14695/**
14696 * Validates the given double input operand returning whether the operation can continue or whether
14697 * contains a NaN value, setting the output accordingly.
14698 *
14699 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14700 * @param pr64Res Where to store the result in case the operation can't continue.
14701 * @param pr64Val The input operand.
14702 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14703 */
14704DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
14705{
14706 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
14707 {
14708 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14709 *pr64Res = *pr64Val;
14710 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14711 *pfMxcsr |= X86_MXCSR_IE;
14712 return true;
14713 }
14714 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
14715 {
14716 /* The QNan operand is placed into the result. */
14717 *pr64Res = *pr64Val;
14718 return true;
14719 }
14720
14721 return false;
14722}
14723#endif
14724
14725
14726/**
14727 * ADDPS
14728 */
14729#ifdef IEM_WITHOUT_ASSEMBLY
14730static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14731{
14732 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14733 return fMxcsr;
14734
14735 RTFLOAT32U r32Src1, r32Src2;
14736 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14737 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14738 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14739 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14740 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14741}
14742
14743
14744IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14745{
14746 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14747 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14748 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14749 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14750}
14751#endif
14752
14753
14754/**
14755 * ADDSS
14756 */
14757#ifdef IEM_WITHOUT_ASSEMBLY
14758IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14759{
14760 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14761 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14762 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14763 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14764}
14765#endif
14766
14767
14768/**
14769 * ADDPD
14770 */
14771#ifdef IEM_WITHOUT_ASSEMBLY
14772static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14773{
14774 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14775 return fMxcsr;
14776
14777 RTFLOAT64U r64Src1, r64Src2;
14778 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14779 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14780 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14781 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14782 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14783}
14784
14785
14786IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14787{
14788 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14789 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14790}
14791#endif
14792
14793
14794/**
14795 * ADDSD
14796 */
14797#ifdef IEM_WITHOUT_ASSEMBLY
14798IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14799{
14800 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14801 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14802}
14803#endif
14804
14805
14806/**
14807 * MULPS
14808 */
14809#ifdef IEM_WITHOUT_ASSEMBLY
14810static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14811{
14812 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14813 return fMxcsr;
14814
14815 RTFLOAT32U r32Src1, r32Src2;
14816 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14817 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14818 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14819 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14820 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14821}
14822
14823
14824IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14825{
14826 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14827 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14828 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14829 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14830}
14831#endif
14832
14833
14834/**
14835 * MULSS
14836 */
14837#ifdef IEM_WITHOUT_ASSEMBLY
14838IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14839{
14840 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14841 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14842 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14843 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14844}
14845#endif
14846
14847
14848/**
14849 * MULPD
14850 */
14851#ifdef IEM_WITHOUT_ASSEMBLY
14852static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14853{
14854 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14855 return fMxcsr;
14856
14857 RTFLOAT64U r64Src1, r64Src2;
14858 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14859 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14860 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14861 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14862 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14863}
14864
14865
14866IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14867{
14868 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14869 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14870}
14871#endif
14872
14873
14874/**
14875 * MULSD
14876 */
14877#ifdef IEM_WITHOUT_ASSEMBLY
14878IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14879{
14880 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14881 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14882}
14883#endif
14884
14885
14886/**
14887 * SUBPS
14888 */
14889#ifdef IEM_WITHOUT_ASSEMBLY
14890static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14891{
14892 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14893 return fMxcsr;
14894
14895 RTFLOAT32U r32Src1, r32Src2;
14896 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14897 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14898 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14899 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14900 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14901}
14902
14903
14904IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14905{
14906 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14907 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14908 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14909 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14910}
14911#endif
14912
14913
14914/**
14915 * SUBSS
14916 */
14917#ifdef IEM_WITHOUT_ASSEMBLY
14918IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14919{
14920 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14921 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14922 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14923 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14924}
14925#endif
14926
14927
14928/**
14929 * SUBPD
14930 */
14931#ifdef IEM_WITHOUT_ASSEMBLY
14932static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14933{
14934 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14935 return fMxcsr;
14936
14937 RTFLOAT64U r64Src1, r64Src2;
14938 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14939 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14940 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14941 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14942 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14943}
14944
14945
14946IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14947{
14948 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14949 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14950}
14951#endif
14952
14953
14954/**
14955 * SUBSD
14956 */
14957#ifdef IEM_WITHOUT_ASSEMBLY
14958IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14959{
14960 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14961 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14962}
14963#endif
14964
14965
14966/**
14967 * MINPS
14968 */
14969#ifdef IEM_WITHOUT_ASSEMBLY
14970static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14971{
14972 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
14973 {
14974 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
14975 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
14976 return fMxcsr | X86_MXCSR_IE;
14977 }
14978
14979 RTFLOAT32U r32Src1, r32Src2;
14980 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14981 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14982 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
14983 {
14984 *pr32Res = r32Src2;
14985 return fMxcsr;
14986 }
14987
14988 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14989 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14990 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
14991 fLe
14992 ? iemFpSoftF32FromIprt(&r32Src1)
14993 : iemFpSoftF32FromIprt(&r32Src2),
14994 pr32Res, fMxcsr);
14995}
14996
14997
14998IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14999{
15000 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15001 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15002 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15003 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15004}
15005#endif
15006
15007
15008/**
15009 * MINSS
15010 */
15011#ifdef IEM_WITHOUT_ASSEMBLY
15012IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15013{
15014 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15015 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15016 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15017 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15018}
15019#endif
15020
15021
15022/**
15023 * MINPD
15024 */
15025#ifdef IEM_WITHOUT_ASSEMBLY
15026static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15027{
15028 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15029 {
15030 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15031 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15032 return fMxcsr | X86_MXCSR_IE;
15033 }
15034
15035 RTFLOAT64U r64Src1, r64Src2;
15036 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15037 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15038 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15039 {
15040 *pr64Res = r64Src2;
15041 return fMxcsr;
15042 }
15043
15044 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15045 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15046 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15047 fLe
15048 ? iemFpSoftF64FromIprt(&r64Src1)
15049 : iemFpSoftF64FromIprt(&r64Src2),
15050 pr64Res, fMxcsr);
15051}
15052
15053
15054IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15055{
15056 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15057 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15058}
15059#endif
15060
15061
15062/**
15063 * MINSD
15064 */
15065#ifdef IEM_WITHOUT_ASSEMBLY
15066IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15067{
15068 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15069 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15070}
15071#endif
15072
15073
15074/**
15075 * DIVPS
15076 */
15077#ifdef IEM_WITHOUT_ASSEMBLY
15078static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15079{
15080 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15081 return fMxcsr;
15082
15083 RTFLOAT32U r32Src1, r32Src2;
15084 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15085 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15086 if (RTFLOAT32U_IS_ZERO(&r32Src2))
15087 {
15088 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
15089 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
15090 {
15091 *pr32Res = g_ar32QNaN[1];
15092 return fMxcsr | X86_MXCSR_IE;
15093 }
15094 else if (RTFLOAT32U_IS_INF(&r32Src1))
15095 {
15096 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15097 return fMxcsr;
15098 }
15099 else
15100 {
15101 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15102 return fMxcsr | X86_MXCSR_ZE;
15103 }
15104 }
15105
15106 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15107 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15108 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15109}
15110
15111
15112IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15113{
15114 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15115 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15116 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15117 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15118}
15119#endif
15120
15121
15122/**
15123 * DIVSS
15124 */
15125#ifdef IEM_WITHOUT_ASSEMBLY
15126IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15127{
15128 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15129 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15130 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15131 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15132}
15133#endif
15134
15135
15136/**
15137 * DIVPD
15138 */
15139#ifdef IEM_WITHOUT_ASSEMBLY
15140static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15141{
15142 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15143 return fMxcsr;
15144
15145 RTFLOAT64U r64Src1, r64Src2;
15146 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15147 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15148 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15149 {
15150 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15151 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15152 {
15153 *pr64Res = g_ar64QNaN[1];
15154 return fMxcsr | X86_MXCSR_IE;
15155 }
15156 else if (RTFLOAT64U_IS_INF(&r64Src1))
15157 {
15158 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15159 return fMxcsr;
15160 }
15161 else
15162 {
15163 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15164 return fMxcsr | X86_MXCSR_ZE;
15165 }
15166 }
15167
15168 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15169 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15170 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15171}
15172
15173
15174IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15175{
15176 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15177 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15178}
15179#endif
15180
15181
15182/**
15183 * DIVSD
15184 */
15185#ifdef IEM_WITHOUT_ASSEMBLY
15186IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15187{
15188 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15189 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15190}
15191#endif
15192
15193
15194/**
15195 * MAXPS
15196 */
15197#ifdef IEM_WITHOUT_ASSEMBLY
15198static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15199{
15200 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15201 {
15202 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15203 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15204 return fMxcsr | X86_MXCSR_IE;
15205 }
15206
15207 RTFLOAT32U r32Src1, r32Src2;
15208 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15209 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15210 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15211 {
15212 *pr32Res = r32Src2;
15213 return fMxcsr;
15214 }
15215
15216 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15217 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15218 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15219 fLe
15220 ? iemFpSoftF32FromIprt(&r32Src2)
15221 : iemFpSoftF32FromIprt(&r32Src1),
15222 pr32Res, fMxcsr);
15223}
15224
15225
15226IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15227{
15228 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15229 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15230 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15231 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15232}
15233#endif
15234
15235
15236/**
15237 * MAXSS
15238 */
15239#ifdef IEM_WITHOUT_ASSEMBLY
15240IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15241{
15242 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15243 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15244 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15245 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15246}
15247#endif
15248
15249
15250/**
15251 * MAXPD
15252 */
15253#ifdef IEM_WITHOUT_ASSEMBLY
15254static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15255{
15256 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15257 {
15258 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15259 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15260 return fMxcsr | X86_MXCSR_IE;
15261 }
15262
15263 RTFLOAT64U r64Src1, r64Src2;
15264 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15265 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15266 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15267 {
15268 *pr64Res = r64Src2;
15269 return fMxcsr;
15270 }
15271
15272 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15273 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15274 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15275 fLe
15276 ? iemFpSoftF64FromIprt(&r64Src2)
15277 : iemFpSoftF64FromIprt(&r64Src1),
15278 pr64Res, fMxcsr);
15279}
15280
15281
15282IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15283{
15284 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15285 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15286}
15287#endif
15288
15289
15290/**
15291 * MAXSD
15292 */
15293#ifdef IEM_WITHOUT_ASSEMBLY
15294IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15295{
15296 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15297 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15298}
15299#endif
15300
15301
15302/**
15303 * CVTSS2SD
15304 */
15305#ifdef IEM_WITHOUT_ASSEMBLY
15306static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15307{
15308 RTFLOAT32U r32Src1;
15309 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15310
15311 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15312 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15313 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15314}
15315
15316
15317IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15318{
15319 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
15320 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15321}
15322#endif
15323
15324
15325/**
15326 * CVTSD2SS
15327 */
15328#ifdef IEM_WITHOUT_ASSEMBLY
15329static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15330{
15331 RTFLOAT64U r64Src1;
15332 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15333
15334 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15335 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15336 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15337}
15338
15339
15340IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15341{
15342 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
15343 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15344 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15345 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15346}
15347#endif
15348
15349
15350/**
15351 * HADDPS
15352 */
15353#ifdef IEM_WITHOUT_ASSEMBLY
15354IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15355{
15356 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15357 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15358 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15359 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15360}
15361#endif
15362
15363
15364/**
15365 * HADDPD
15366 */
15367#ifdef IEM_WITHOUT_ASSEMBLY
15368IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15369{
15370 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15371 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15372}
15373#endif
15374
15375
15376/**
15377 * HSUBPS
15378 */
15379#ifdef IEM_WITHOUT_ASSEMBLY
15380IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15381{
15382 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15383 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15384 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15385 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15386}
15387#endif
15388
15389
15390/**
15391 * HSUBPD
15392 */
15393#ifdef IEM_WITHOUT_ASSEMBLY
15394IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15395{
15396 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15397 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15398}
15399#endif
15400
15401
15402/**
15403 * SQRTPS
15404 */
15405#ifdef IEM_WITHOUT_ASSEMBLY
15406static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15407{
15408 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15409 return fMxcsr;
15410
15411 RTFLOAT32U r32Src;
15412 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
15413 if (RTFLOAT32U_IS_ZERO(&r32Src))
15414 {
15415 *pr32Res = r32Src;
15416 return fMxcsr;
15417 }
15418 else if (r32Src.s.fSign)
15419 {
15420 *pr32Res = g_ar32QNaN[1];
15421 return fMxcsr | X86_MXCSR_IE;
15422 }
15423
15424 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15425 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15426 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15427}
15428
15429
15430IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15431{
15432 RT_NOREF(puSrc1);
15433
15434 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15435 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15436 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15437 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15438}
15439#endif
15440
15441
15442/**
15443 * SQRTSS
15444 */
15445#ifdef IEM_WITHOUT_ASSEMBLY
15446IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15447{
15448 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15449 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15450 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15451 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15452}
15453#endif
15454
15455
15456/**
15457 * SQRTPD
15458 */
15459#ifdef IEM_WITHOUT_ASSEMBLY
15460static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
15461{
15462 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
15463 return fMxcsr;
15464
15465 RTFLOAT64U r64Src;
15466 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
15467 if (RTFLOAT64U_IS_ZERO(&r64Src))
15468 {
15469 *pr64Res = r64Src;
15470 return fMxcsr;
15471 }
15472 else if (r64Src.s.fSign)
15473 {
15474 *pr64Res = g_ar64QNaN[1];
15475 return fMxcsr | X86_MXCSR_IE;
15476 }
15477
15478 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15479 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
15480 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15481}
15482
15483
15484IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15485{
15486 RT_NOREF(puSrc1);
15487
15488 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15489 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15490}
15491#endif
15492
15493
15494/**
15495 * SQRTSD
15496 */
15497#ifdef IEM_WITHOUT_ASSEMBLY
15498IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15499{
15500 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
15501 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15502}
15503#endif
15504
15505
15506/**
15507 * ADDSUBPS
15508 */
15509#ifdef IEM_WITHOUT_ASSEMBLY
15510IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15511{
15512 RT_NOREF(puSrc1);
15513
15514 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15515 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15516 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15517 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15518}
15519#endif
15520
15521
15522/**
15523 * ADDSUBPD
15524 */
15525#ifdef IEM_WITHOUT_ASSEMBLY
15526IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15527{
15528 RT_NOREF(puSrc1);
15529
15530 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15531 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15532}
15533#endif
15534
15535
15536/**
15537 * CVTPD2PS
15538 */
15539#ifdef IEM_WITHOUT_ASSEMBLY
15540static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15541{
15542 RTFLOAT64U r64Src1;
15543 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15544
15545 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15546 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15547 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15548}
15549
15550
15551IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15552{
15553 RT_NOREF(puSrc1);
15554
15555 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15556 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15557 pResult->uResult.au32[2] = 0;
15558 pResult->uResult.au32[3] = 0;
15559}
15560#endif
15561
15562
15563/**
15564 * CVTPS2PD
15565 */
15566#ifdef IEM_WITHOUT_ASSEMBLY
15567static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15568{
15569 RTFLOAT32U r32Src1;
15570 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15571
15572 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15573 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15574 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15575}
15576
15577
15578IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15579{
15580 RT_NOREF(puSrc1);
15581
15582 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15583 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15584}
15585#endif
15586
15587
15588/**
15589 * CVTDQ2PS
15590 */
15591#ifdef IEM_WITHOUT_ASSEMBLY
15592static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
15593{
15594 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15595 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
15596 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15597}
15598
15599
15600IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15601{
15602 RT_NOREF(puSrc1);
15603
15604 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15605 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15606 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
15607 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
15608}
15609#endif
15610
15611
15612/**
15613 * CVTPS2DQ
15614 */
15615#ifdef IEM_WITHOUT_ASSEMBLY
15616static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15617{
15618 RTFLOAT32U r32Src;
15619 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15620
15621 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15622 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15623 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15624}
15625
15626
15627IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15628{
15629 RT_NOREF(puSrc1);
15630
15631 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15632 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15633 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15634 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15635}
15636#endif
15637
15638
15639/**
15640 * CVTTPS2DQ
15641 */
15642#ifdef IEM_WITHOUT_ASSEMBLY
15643static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15644{
15645 RTFLOAT32U r32Src;
15646 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15647
15648 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15649 SoftState.roundingMode = softfloat_round_minMag;
15650 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
15651 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15652}
15653
15654
15655IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15656{
15657 RT_NOREF(puSrc1);
15658
15659 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15660 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15661 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15662 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15663}
15664#endif
15665
15666
15667/**
15668 * CVTTPD2DQ
15669 */
15670#ifdef IEM_WITHOUT_ASSEMBLY
15671static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15672{
15673 RTFLOAT64U r64Src;
15674 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15675
15676 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15677 SoftState.roundingMode = softfloat_round_minMag;
15678 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15679 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15680}
15681
15682
15683IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15684{
15685 RT_NOREF(puSrc1);
15686
15687 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15688 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15689 pResult->uResult.au64[1] = 0;
15690}
15691#endif
15692
15693
15694/**
15695 * CVTDQ2PD
15696 */
15697#ifdef IEM_WITHOUT_ASSEMBLY
15698static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
15699{
15700 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15701 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
15702 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15703}
15704
15705
15706IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15707{
15708 RT_NOREF(puSrc1);
15709
15710 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15711 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15712}
15713#endif
15714
15715
15716/**
15717 * CVTPD2DQ
15718 */
15719#ifdef IEM_WITHOUT_ASSEMBLY
15720static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15721{
15722 RTFLOAT64U r64Src;
15723 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15724
15725 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15726 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15727 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15728}
15729
15730
15731IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15732{
15733 RT_NOREF(puSrc1);
15734
15735 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15736 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15737 pResult->uResult.au64[1] = 0;
15738}
15739#endif
15740
15741
15742/**
15743 * [V]SHUFPS
15744 */
15745#ifdef IEM_WITHOUT_ASSEMBLY
15746IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15747{
15748 RTUINT128U const uSrc1 = *puDst;
15749 RTUINT128U const uSrc2 = *puSrc;
15750 ASMCompilerBarrier();
15751 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15752 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15753 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15754 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15755}
15756#endif
15757
15758
15759IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15760{
15761 RTUINT128U const uSrc1 = *puSrc1;
15762 RTUINT128U const uSrc2 = *puSrc2;
15763 ASMCompilerBarrier();
15764 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15765 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15766 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15767 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15768}
15769
15770
15771IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15772{
15773 RTUINT256U const uSrc1 = *puSrc1;
15774 RTUINT256U const uSrc2 = *puSrc2;
15775 ASMCompilerBarrier();
15776 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15777 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15778 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15779 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15780
15781 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
15782 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
15783 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
15784 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
15785}
15786
15787
15788/**
15789 * [V]SHUFPD
15790 */
15791#ifdef IEM_WITHOUT_ASSEMBLY
15792IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15793{
15794 RTUINT128U const uSrc1 = *puDst;
15795 RTUINT128U const uSrc2 = *puSrc;
15796 ASMCompilerBarrier();
15797 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15798 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15799}
15800#endif
15801
15802
15803IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15804{
15805 RTUINT128U const uSrc1 = *puSrc1;
15806 RTUINT128U const uSrc2 = *puSrc2;
15807 ASMCompilerBarrier();
15808 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15809 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15810}
15811
15812
15813IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15814{
15815 RTUINT256U const uSrc1 = *puSrc1;
15816 RTUINT256U const uSrc2 = *puSrc2;
15817 ASMCompilerBarrier();
15818 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15819 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15820 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
15821 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
15822}
15823
15824
15825/*
15826 * PHMINPOSUW / VPHMINPOSUW
15827 */
15828IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15829{
15830 uint16_t u16Min = puSrc->au16[0];
15831 uint8_t idxMin = 0;
15832
15833 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
15834 if (puSrc->au16[i] < u16Min)
15835 {
15836 u16Min = puSrc->au16[i];
15837 idxMin = i;
15838 }
15839
15840 puDst->au64[0] = 0;
15841 puDst->au64[1] = 0;
15842 puDst->au16[0] = u16Min;
15843 puDst->au16[1] = idxMin;
15844}
15845
15846
/**
 * VPHMINPOSUW: forwards to iemAImpl_phminposuw_u128_fallback with the same
 * arguments.
 *
 * @param   puDst   Where to store the result.
 * @param   puSrc   The eight words to search.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
}
15851
15852
15853/*
15854 * [V]PBLENDVB
15855 */
15856IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15857{
15858 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15859 if (puMask->au8[i] & RT_BIT(7))
15860 puDst->au8[i] = puSrc->au8[i];
15861}
15862
15863
15864IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15865{
15866 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15867 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15868}
15869
15870
15871IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15872{
15873 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15874 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15875}
15876
15877
15878/*
15879 * [V]BLENDVPS
15880 */
15881IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15882{
15883 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15884 if (puMask->au32[i] & RT_BIT_32(31))
15885 puDst->au32[i] = puSrc->au32[i];
15886}
15887
15888
15889IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15890{
15891 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15892 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
15893}
15894
15895
15896IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15897{
15898 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15899 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
15900}
15901
15902
15903/*
15904 * [V]BLENDVPD
15905 */
15906IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15907{
15908 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
15909 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
15910}
15911
15912
15913IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15914{
15915 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15916 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
15917}
15918
15919
15920IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15921{
15922 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15923 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
15924}
15925
15926
15927/**
15928 * [V]PALIGNR
15929 */
15930IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
15931{
15932 uint64_t const u64Src1 = *pu64Dst;
15933 ASMCompilerBarrier();
15934
15935 if (bEvil >= 16)
15936 *pu64Dst = 0;
15937 else if (bEvil >= 8)
15938 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
15939 else
15940 {
15941 uint8_t cShift = bEvil * 8;
15942 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
15943 | (u64Src2 >> cShift);
15944 }
15945}
15946
15947
15948IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15949{
15950 RTUINT128U const uSrc1 = *puDst;
15951 RTUINT128U const uSrc2 = *puSrc;
15952 ASMCompilerBarrier();
15953
15954 puDst->au64[0] = 0;
15955 puDst->au64[1] = 0;
15956 if (bEvil >= 32)
15957 { /* Everything stays 0. */ }
15958 else if (bEvil >= 16)
15959 {
15960 bEvil -= 16;
15961 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
15962 puDst->au8[i - bEvil] = uSrc1.au8[i];
15963 }
15964 else
15965 {
15966 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
15967 puDst->au8[i] = uSrc2.au8[i + bEvil];
15968 for (uint8_t i = 0; i < bEvil; i++)
15969 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
15970 }
15971}
15972
15973
15974IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15975{
15976 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
15977 RTUINT128U const uSrc2 = *puSrc2;
15978 ASMCompilerBarrier();
15979
15980 puDst->au64[0] = 0;
15981 puDst->au64[1] = 0;
15982 if (bEvil >= 32)
15983 { /* Everything stays 0. */ }
15984 else if (bEvil >= 16)
15985 {
15986 bEvil -= 16;
15987 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
15988 puDst->au8[i - bEvil] = uSrc1.au8[i];
15989 }
15990 else
15991 {
15992 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
15993 puDst->au8[i] = uSrc2.au8[i + bEvil];
15994 for (uint8_t i = 0; i < bEvil; i++)
15995 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
15996 }
15997}
15998
15999
16000IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16001{
16002 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16003 RTUINT256U const uSrc2 = *puSrc2;
16004 ASMCompilerBarrier();
16005
16006 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
16007 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
16008}
16009
16010
16011/**
16012 * [V]PBLENDW
16013 */
16014IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16015{
16016 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16017 if (bEvil & RT_BIT(i))
16018 puDst->au16[i] = puSrc->au16[i];
16019}
16020
16021
16022IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16023{
16024 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16025 if (bEvil & RT_BIT(i))
16026 puDst->au16[i] = puSrc2->au16[i];
16027 else
16028 puDst->au16[i] = puSrc1->au16[i];
16029}
16030
16031
16032IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16033{
16034 for (uint8_t i = 0; i < 8; i++)
16035 if (bEvil & RT_BIT(i))
16036 {
16037 puDst->au16[ i] = puSrc2->au16[ i];
16038 puDst->au16[8 + i] = puSrc2->au16[8 + i];
16039 }
16040 else
16041 {
16042 puDst->au16[ i] = puSrc1->au16[ i];
16043 puDst->au16[8 + i] = puSrc1->au16[8 + i];
16044 }
16045}
16046
16047
16048/**
16049 * [V]BLENDPS
16050 */
16051IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16052{
16053 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16054 if (bEvil & RT_BIT(i))
16055 puDst->au32[i] = puSrc->au32[i];
16056}
16057
16058
16059IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16060{
16061 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16062 if (bEvil & RT_BIT(i))
16063 puDst->au32[i] = puSrc2->au32[i];
16064 else
16065 puDst->au32[i] = puSrc1->au32[i];
16066}
16067
16068
16069IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16070{
16071 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16072 if (bEvil & RT_BIT(i))
16073 puDst->au32[i] = puSrc2->au32[i];
16074 else
16075 puDst->au32[i] = puSrc1->au32[i];
16076}
16077
16078
16079/**
16080 * [V]BLENDPD
16081 */
16082IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16083{
16084 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16085 if (bEvil & RT_BIT(i))
16086 puDst->au64[i] = puSrc->au64[i];
16087}
16088
16089
16090IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16091{
16092 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16093 if (bEvil & RT_BIT(i))
16094 puDst->au64[i] = puSrc2->au64[i];
16095 else
16096 puDst->au64[i] = puSrc1->au64[i];
16097}
16098
16099
16100IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16101{
16102 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16103 if (bEvil & RT_BIT(i))
16104 puDst->au64[i] = puSrc2->au64[i];
16105 else
16106 puDst->au64[i] = puSrc1->au64[i];
16107}
16108
16109
16110/**
16111 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
16112 */
16113
/* The S-Box lookup table, indexed by the byte to substitute (one row per high nibble). */
static uint8_t iemAImpl_aes_sbox[256] =
{
    /* 0x00 */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x10 */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x20 */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x30 */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x40 */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x50 */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x60 */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x70 */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x80 */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x90 */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xa0 */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xb0 */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xc0 */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xd0 */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xe0 */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xf0 */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
16132
/* The InvS-Box lookup table, indexed by the byte to substitute (one row per high nibble). */
static uint8_t iemAImpl_aes_inv_sbox[256] =
{
    /* 0x00 */ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    /* 0x10 */ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    /* 0x20 */ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    /* 0x30 */ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    /* 0x40 */ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    /* 0x50 */ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    /* 0x60 */ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    /* 0x70 */ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    /* 0x80 */ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    /* 0x90 */ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    /* 0xa0 */ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    /* 0xb0 */ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    /* 0xc0 */ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    /* 0xd0 */ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    /* 0xe0 */ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    /* 0xf0 */ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
16152
/* The ShiftRows lookup table: entry N is the index of the input byte that ends up in output byte N. */
static uint8_t iemAImpl_aes_shift_rows_tbl[16] =
{
     0,  5, 10, 15,
     4,  9, 14,  3,
     8, 13,  2,  7,
    12,  1,  6, 11
};
16157
/* The InvShiftRows lookup table: entry N is the index of the input byte that ends up in output byte N. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[16] =
{
     0, 13, 10,  7,
     4,  1, 14, 11,
     8,  5,  2, 15,
    12,  9,  6,  3
};
16162
16163static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
16164{
16165 RTUINT128U uVal;
16166 int i;
16167
16168 for (i = 0; i < 16; ++i)
16169 uVal.au8[i] = abSubst[puSrc->au8[i]];
16170
16171 return uVal;
16172}
16173
/**
 * Doubles a byte with reduction: shifts left by one and XORs in 0x1b (27)
 * when the top bit of the input was set.
 *
 * @returns The reduced product as a byte.
 * @param   u   The byte to double.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t const bShifted = (uint8_t)(u << 1);
    if (u & 0x80)
        return (uint8_t)(bShifted ^ 0x1b);
    return bShifted;
}
16178
16179static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
16180{
16181 RTUINT128U uVal;
16182 int i;
16183 uint8_t tmp;
16184
16185 for (i = 0; i < 16; i += 4) {
16186 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
16187 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
16188 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
16189 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
16190 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
16191 }
16192
16193 return uVal;
16194}
16195
16196static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
16197{
16198 RTUINT128U uVal;
16199 int i;
16200
16201 for (i = 0; i < 16; ++i)
16202 uVal.au8[i] = puSrc->au8[abShift[i]];
16203
16204 return uVal;
16205}
16206
16207static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
16208{
16209 uint8_t val;
16210
16211 val = ((b >> 0) & 1) * a;
16212 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
16213 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
16214 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
16215 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
16216
16217 return val;
16218}
16219
16220static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
16221{
16222 RTUINT128U uVal;
16223 int i;
16224
16225 for (i = 0; i < 16; i += 4) {
16226 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
16227 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
16228 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
16229 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
16230 }
16231
16232 return uVal;
16233}
16234
16235static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
16236{
16237 RTUINT32U uTmp;
16238
16239 uTmp.au32[0] = w;
16240 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
16241 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
16242 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
16243 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
16244
16245 return uTmp.au32[0];
16246}
16247
/**
 * Rotates a dword right by eight bits.
 *
 * @returns The rotated dword.
 * @param   w   The dword to rotate.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uLowByteOnTop   = w << 24;
    uint32_t const uUpperBytesDown = w >> 8;
    return uUpperBytesDown | uLowByteOnTop;
}
16252
16253/**
16254 * [V]AESKEYGENASSIST
16255 */
16256IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
16257{
16258 RTUINT128U uTmp;
16259 uint32_t uRCon = bImm; /* Round constant. */
16260
16261 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
16262 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
16263 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
16264 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
16265
16266 *puDst = uTmp;
16267}
16268
16269
16270/**
16271 * [V]AESIMC
16272 */
16273IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16274{
16275 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
16276}
16277
16278
16279/**
16280 * [V]AESENC
16281 */
16282IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16283{
16284 RTUINT128U uTmp;
16285
16286 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16287 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16288 uTmp = iemAImpl_aes_mix_col(&uTmp);
16289 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16290 uTmp.au64[1] ^= puSrc->au64[1];
16291
16292 *puDst = uTmp;
16293}
16294
16295
16296/**
16297 * [V]AESENCLAST
16298 */
16299IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16300{
16301 RTUINT128U uTmp;
16302
16303 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16304 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16305 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16306 uTmp.au64[1] ^= puSrc->au64[1];
16307
16308 *puDst = uTmp;
16309}
16310
16311
16312/**
16313 * [V]AESDEC
16314 */
16315IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16316{
16317 RTUINT128U uTmp;
16318
16319 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16320 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16321 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
16322 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16323 uTmp.au64[1] ^= puSrc->au64[1];
16324
16325 *puDst = uTmp;
16326}
16327
16328
16329/**
16330 * [V]AESDECLAST
16331 */
16332IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16333{
16334 RTUINT128U uTmp;
16335
16336 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16337 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16338 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16339 uTmp.au64[1] ^= puSrc->au64[1];
16340
16341 *puDst = uTmp;
16342}
16343
16344
16345/**
16346 * [V]PCMPISTRI
16347 */
16348IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRISRC pSrc, uint8_t bEvil))
16349{
16350 RT_NOREF(pu32Ecx, pEFlags, pSrc, bEvil);
16351 AssertReleaseFailed();
16352}
16353
16354
16355/*
16356 * [V]PCLMULQDQ
16357 */
/**
 * PCLMULQDQ, two operand form: forwards to the three operand fallback with
 * the destination doubling as the first source.
 *
 * @param   puDst   First source on input, receives the result.
 * @param   puSrc   The second source.
 * @param   bEvil   The immediate, passed on unchanged.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
{
    iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
}
16362
16363
16364IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16365{
16366 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
16367 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
16368
16369 puDst->au64[0] = 0;
16370 puDst->au64[1] = 0;
16371
16372 /*
16373 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
16374 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
16375 * and squeeze out some optimizations.
16376 */
16377 if (uSrc1 & 0x1)
16378 puDst->au64[0] = uSrc2;
16379
16380 uSrc1 >>= 1;
16381
16382 uint8_t iDigit = 1;
16383 while (uSrc1)
16384 {
16385 if (uSrc1 & 0x1)
16386 {
16387 puDst->au64[0] ^= (uSrc2 << iDigit);
16388 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
16389 }
16390
16391 uSrc1 >>= 1;
16392 iDigit++;
16393 }
16394}
16395
16396
16397/**
16398 * [V]PINSRW
16399 */
16400#ifdef IEM_WITHOUT_ASSEMBLY
16401IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
16402{
16403 uint8_t cShift = (bEvil & 0x3) * 16;
16404 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
16405}
16406
16407
16408IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
16409{
16410 puDst->au16[bEvil & 0x7] = u16Src;
16411}
16412#endif
16413
16414
16415IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
16416{
16417 *puDst = *puSrc;
16418 puDst->au16[bEvil & 0x7] = u16Src;
16419}
16420
16421
16422/**
16423 * [V]PEXTRW
16424 */
16425#ifdef IEM_WITHOUT_ASSEMBLY
16426IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
16427{
16428 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
16429}
16430
16431
16432IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16433{
16434 *pu16Dst = puSrc->au16[bEvil & 0x7];
16435}
16436
16437#endif
16438
16439IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16440{
16441 *pu16Dst = puSrc->au16[bEvil & 0x7];
16442}
16443
16444
16445/**
16446 * [V]MOVMSKPS
16447 */
16448#ifdef IEM_WITHOUT_ASSEMBLY
16449IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16450{
16451 *pu8Dst = puSrc->au32[0] >> 31;
16452 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16453 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16454 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16455}
16456
16457#endif
16458
16459IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16460{
16461 *pu8Dst = puSrc->au32[0] >> 31;
16462 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16463 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16464 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16465}
16466
16467
16468IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
16469{
16470 *pu8Dst = puSrc->au32[0] >> 31;
16471 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16472 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16473 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16474 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
16475 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
16476 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
16477 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
16478}
16479
16480
16481/**
16482 * [V]MOVMSKPD
16483 */
16484#ifdef IEM_WITHOUT_ASSEMBLY
16485IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16486{
16487 *pu8Dst = puSrc->au64[0] >> 63;
16488 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16489}
16490
16491#endif
16492
16493IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16494{
16495 *pu8Dst = puSrc->au64[0] >> 63;
16496 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16497}
16498
16499
16500IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
16501{
16502 *pu8Dst = puSrc->au64[0] >> 63;
16503 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16504 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
16505 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
16506}
16507
16508
16509/**
16510 * CVTTSD2SI
16511 */
16512#ifdef IEM_WITHOUT_ASSEMBLY
16513IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
16514{
16515 RTFLOAT64U r64Src;
16516
16517 r64Src.u = *pu64Src;
16518 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16519
16520 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16521 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
16522 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16523}
16524
16525
16526IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
16527{
16528 RTFLOAT64U r64Src;
16529
16530 r64Src.u = *pu64Src;
16531 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16532
16533 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16534 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
16535 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16536}
16537#endif
16538
16539
16540/**
16541 * CVTSD2SI
16542 */
16543#ifdef IEM_WITHOUT_ASSEMBLY
16544IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
16545{
16546 RTFLOAT64U r64Src;
16547
16548 r64Src.u = *pu64Src;
16549 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16550
16551 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16552 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16553 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16554}
16555
16556
16557IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
16558{
16559 RTFLOAT64U r64Src;
16560
16561 r64Src.u = *pu64Src;
16562 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16563
16564 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16565 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16566 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16567}
16568#endif
16569
16570
16571/**
16572 * CVTTSS2SI
16573 */
16574#ifdef IEM_WITHOUT_ASSEMBLY
16575IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
16576{
16577 RTFLOAT32U r32Src;
16578
16579 r32Src.u = *pu32Src;
16580 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16581
16582 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16583 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16584 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16585}
16586
16587
16588IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
16589{
16590 RTFLOAT32U r32Src;
16591
16592 r32Src.u = *pu32Src;
16593 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16594
16595 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16596 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16597 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16598}
16599#endif
16600
16601
16602/**
16603 * CVTSS2SI
16604 */
16605#ifdef IEM_WITHOUT_ASSEMBLY
16606IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
16607{
16608 RTFLOAT32U r32Src;
16609
16610 r32Src.u = *pu32Src;
16611 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16612
16613 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16614 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16615 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16616}
16617
16618
16619IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
16620{
16621 RTFLOAT32U r32Src;
16622
16623 r32Src.u = *pu32Src;
16624 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16625
16626 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16627 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16628 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16629}
16630#endif
16631
16632
16633/**
16634 * CVTSI2SD
16635 */
16636#ifdef IEM_WITHOUT_ASSEMBLY
16637IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
16638{
16639 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16640 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
16641 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
16642}
16643
16644
16645IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
16646{
16647 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16648 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
16649 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
16650}
16651#endif
16652
16653
16654/**
16655 * CVTSI2SS
16656 */
16657#ifdef IEM_WITHOUT_ASSEMBLY
16658IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
16659{
16660 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16661 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
16662 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
16663}
16664
16665
16666IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
16667{
16668 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16669 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
16670 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
16671}
16672#endif
16673
16674
16675/**
16676 * [V]UCOMISS
16677 */
16678#ifdef IEM_WITHOUT_ASSEMBLY
16679IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16680{
16681 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16682
16683 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
16684 {
16685 *pfMxcsr |= X86_MXCSR_IE;
16686 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16687 }
16688 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
16689 {
16690 /* ucomiss doesn't raise \#IE for quiet NaNs. */
16691 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16692 }
16693 else
16694 {
16695 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16696
16697 RTFLOAT32U r32Src1, r32Src2;
16698 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
16699 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
16700
16701 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16702 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16703 if (f32_eq(f32Src1, f32Src2, &SoftState))
16704 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16705 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16706 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16707 /* else: GREATER_THAN 000 */
16708
16709 *pfMxcsr |= fDe;
16710 }
16711
16712 *pfEFlags = fEFlagsNew;
16713}
16714#endif
16715
16716IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16717{
16718 iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16719}
16720
16721
16722/**
16723 * [V]UCOMISD
16724 */
16725#ifdef IEM_WITHOUT_ASSEMBLY
16726IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16727{
16728 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16729
16730 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
16731 {
16732 *pfMxcsr |= X86_MXCSR_IE;
16733 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16734 }
16735 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
16736 {
16737 /* ucomiss doesn't raise \#IE for quiet NaNs. */
16738 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16739 }
16740 else
16741 {
16742 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16743
16744 RTFLOAT64U r64Src1, r64Src2;
16745 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
16746 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
16747
16748 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
16749 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
16750 if (f64_eq(f64Src1, f64Src2, &SoftState))
16751 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16752 else if (f64_lt(f64Src1, f64Src2, &SoftState))
16753 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16754 /* else: GREATER_THAN 000 */
16755
16756 *pfMxcsr |= fDe;
16757 }
16758
16759 *pfEFlags = fEFlagsNew;
16760}
16761#endif
16762
16763IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16764{
16765 iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16766}
16767
16768
16769/**
16770 * [V]COMISS
16771 */
16772#ifdef IEM_WITHOUT_ASSEMBLY
16773IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16774{
16775 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16776
16777 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
16778 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
16779 {
16780 *pfMxcsr |= X86_MXCSR_IE;
16781 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16782 }
16783 else
16784 {
16785 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16786
16787 RTFLOAT32U r32Src1, r32Src2;
16788 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
16789 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
16790
16791 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16792 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16793 if (f32_eq(f32Src1, f32Src2, &SoftState))
16794 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16795 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16796 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16797 /* else: GREATER_THAN 000 */
16798
16799 *pfMxcsr |= fDe;
16800 }
16801
16802 *pfEFlags = fEFlagsNew;
16803}
16804#endif
16805
16806
16807IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16808{
16809 iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16810}
16811
16812
16813/**
16814 * [V]COMISD
16815 */
16816#ifdef IEM_WITHOUT_ASSEMBLY
16817IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16818{
16819 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16820
16821 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
16822 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
16823 {
16824 *pfMxcsr |= X86_MXCSR_IE;
16825 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16826 }
16827 else
16828 {
16829 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16830
16831 RTFLOAT64U r64Src1, r64Src2;
16832 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
16833 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
16834
16835 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
16836 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
16837 if (f64_eq(f64Src1, f64Src2, &SoftState))
16838 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16839 else if (f64_lt(f64Src1, f64Src2, &SoftState))
16840 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16841 /* else: GREATER_THAN 000 */
16842
16843 *pfMxcsr |= fDe;
16844 }
16845
16846 *pfEFlags = fEFlagsNew;
16847}
16848#endif
16849
16850IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16851{
16852 iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16853}
16854
16855
16856/**
16857 * CMPPS / CMPPD / CMPSS / CMPSD
16858 */
16859#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * One row of the compare predicate truth table.
 *
 * Apart from the QNaN signalling flag, each member holds the boolean outcome
 * of the predicate for one possible relation between the operands A and B.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Whether a QNaN source operand sets the invalid operation flag (MXCSR.IE). */
    bool                fSignalsOnQNan;
    /** Predicate result when the source operands are unordered. */
    bool                fUnordered;
    /** Predicate result when A = B. */
    bool                fEqual;
    /** Predicate result when A < B. */
    bool                fLowerThan;
    /** Predicate result when A > B. */
    bool                fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a read-only truth table entry. */
typedef CMPTRUTHTBLENTRY const *PCCMPTRUTHTBLENTRY;
16878
16879
16880/** The compare truth table (indexed by immediate). */
16881static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
16882{
16883 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
16884 /* 00H (EQ_OQ) */ { false, false, true, false, false },
16885 /* 01H (LT_OS) */ { true, false, false, true, false },
16886 /* 02H (LE_OS) */ { true, false, true, true, false },
16887 /* 03H (UNORD_Q) */ { false, true, false, false, false },
16888 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
16889 /* 05H (NLT_US) */ { true, true, true, false, true },
16890 /* 06H (NLE_US) */ { true, true, false, false, true },
16891 /* 07H (ORQ_Q) */ { false, false, true, true, true },
16892 /** @todo AVX variants. */
16893};
16894
16895
16896static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
16897{
16898 bool fRes;
16899 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
16900
16901 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
16902 {
16903 *pfMxcsr |= X86_MXCSR_IE;
16904 fRes = g_aCmpTbl[bEvil].fUnordered;
16905 }
16906 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
16907 {
16908 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
16909 *pfMxcsr |= X86_MXCSR_IE;
16910 fRes = g_aCmpTbl[bEvil].fUnordered;
16911 }
16912 else
16913 {
16914 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16915
16916 RTFLOAT32U r32Src1, r32Src2;
16917 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
16918 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
16919
16920 *pfMxcsr |= fDe;
16921 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16922 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16923 if (f32_eq(f32Src1, f32Src2, &SoftState))
16924 fRes = g_aCmpTbl[bEvil].fEqual;
16925 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16926 fRes = g_aCmpTbl[bEvil].fLowerThan;
16927 else
16928 fRes = g_aCmpTbl[bEvil].fGreaterThan;
16929 }
16930
16931 return fRes;
16932}
16933
16934
16935static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
16936{
16937 bool fRes;
16938 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
16939
16940 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
16941 {
16942 *pfMxcsr |= X86_MXCSR_IE;
16943 fRes = g_aCmpTbl[bEvil].fUnordered;
16944 }
16945 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
16946 {
16947 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
16948 *pfMxcsr |= X86_MXCSR_IE;
16949 fRes = g_aCmpTbl[bEvil].fUnordered;
16950 }
16951 else
16952 {
16953 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16954
16955 RTFLOAT64U r64Src1, r64Src2;
16956 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1);
16957 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
16958
16959 *pfMxcsr |= fDe;
16960 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
16961 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
16962 if (f64_eq(f64Src1, f64Src2, &SoftState))
16963 fRes = g_aCmpTbl[bEvil].fEqual;
16964 else if (f64_lt(f64Src1, f64Src2, &SoftState))
16965 fRes = g_aCmpTbl[bEvil].fLowerThan;
16966 else
16967 fRes = g_aCmpTbl[bEvil].fGreaterThan;
16968 }
16969
16970 return fRes;
16971}
16972
16973
16974IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
16975{
16976 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
16977 {
16978 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
16979 puDst->au32[i] = UINT32_MAX;
16980 else
16981 puDst->au32[i] = 0;
16982 }
16983}
16984
16985
16986IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
16987{
16988 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
16989 {
16990 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
16991 puDst->au64[i] = UINT64_MAX;
16992 else
16993 puDst->au64[i] = 0;
16994 }
16995}
16996
16997
16998IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
16999{
17000 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
17001 puDst->au32[0] = UINT32_MAX;
17002 else
17003 puDst->au32[0] = 0;
17004
17005 puDst->au32[1] = pSrc->uSrc1.au32[1];
17006 puDst->au64[1] = pSrc->uSrc1.au64[1];
17007}
17008
17009
17010IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17011{
17012 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
17013 puDst->au64[0] = UINT64_MAX;
17014 else
17015 puDst->au64[0] = 0;
17016
17017 puDst->au64[1] = pSrc->uSrc1.au64[1];
17018}
17019#endif
17020
17021
17022/**
17023 * CVTPD2PI
17024 */
17025#ifdef IEM_WITHOUT_ASSEMBLY
17026static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17027{
17028 RTFLOAT64U r64Src;
17029 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17030
17031 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17032 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17033 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17034}
17035
17036
17037IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17038{
17039 RTUINT64U u64Res;
17040 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17041 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17042
17043 *pu64Dst = u64Res.u;
17044 *pfMxcsr = fMxcsrOut;
17045}
17046#endif
17047
17048
17049/**
17050 * CVTTPD2PI
17051 */
17052#ifdef IEM_WITHOUT_ASSEMBLY
17053static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17054{
17055 RTFLOAT64U r64Src;
17056 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17057
17058 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17059 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17060 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17061}
17062
17063
17064IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17065{
17066 RTUINT64U u64Res;
17067 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17068 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17069
17070 *pu64Dst = u64Res.u;
17071 *pfMxcsr = fMxcsrOut;
17072}
17073#endif
17074
17075
17076/**
17077 * CVTPI2PS
17078 */
17079#ifdef IEM_WITHOUT_ASSEMBLY
17080static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
17081{
17082 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17083 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
17084 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
17085}
17086
17087
17088IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17089{
17090 RTUINT64U uSrc = { u64Src };
17091 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
17092 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
17093 *pfMxcsr = fMxcsrOut;
17094}
17095#endif
17096
17097
17098/**
17099 * CVTPI2PD
17100 */
17101#ifdef IEM_WITHOUT_ASSEMBLY
17102static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
17103{
17104 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17105 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
17106 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
17107}
17108
17109
17110IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17111{
17112 RTUINT64U uSrc = { u64Src };
17113 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
17114 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
17115 *pfMxcsr = fMxcsrOut;
17116}
17117#endif
17118
17119
17120/**
17121 * CVTPS2PI
17122 */
17123#ifdef IEM_WITHOUT_ASSEMBLY
17124static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17125{
17126 RTFLOAT32U r32Src;
17127 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17128
17129 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17130 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17131 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17132}
17133
17134
17135IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17136{
17137 RTUINT64U uDst;
17138 RTUINT64U uSrc = { u64Src };
17139 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17140 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17141 *pu64Dst = uDst.u;
17142 *pfMxcsr = fMxcsrOut;
17143}
17144#endif
17145
17146
17147/**
17148 * CVTTPS2PI
17149 */
17150#ifdef IEM_WITHOUT_ASSEMBLY
17151static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17152{
17153 RTFLOAT32U r32Src;
17154 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17155
17156 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17157 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17158 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17159}
17160
17161
17162IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17163{
17164 RTUINT64U uDst;
17165 RTUINT64U uSrc = { u64Src };
17166 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17167 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17168 *pu64Dst = uDst.u;
17169 *pfMxcsr = fMxcsrOut;
17170}
17171#endif
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette