VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 96686

Last change on this file since 96686 was 96681, checked in by vboxsync, 2 years ago

VMM/IEM: Implement cvtps2pd instruction, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 597.0 KB
Line 
1/* $Id: IEMAllAImplC.cpp 96681 2022-09-09 14:52:20Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2022 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
48/** @def IEM_WITHOUT_ASSEMBLY
49 * Enables all the code in this file.
 *
 * If the build has not defined it already, it is defined here automatically
 * for ARM32 and ARM64 targets and when Doxygen is running.
50 */
51#if !defined(IEM_WITHOUT_ASSEMBLY)
52# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
53# define IEM_WITHOUT_ASSEMBLY
54# endif
55#endif
56/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes:
   when it is defined, any IEM_WITHOUT_ASSEMBLY definition (including the
   automatic one above) is removed again. */
57#ifdef IEM_WITH_ASSEMBLY
58# undef IEM_WITHOUT_ASSEMBLY
59#endif
60
/**
 * Derives the sign flag (SF) from a result of the given bit width.
 *
 * SF is a copy of the most significant bit of the result, so the result is
 * shifted right until that bit sits at the SF position and is then masked.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( X86_EFL_SF & (uint32_t)((a_uResult) >> ((a_cBitsWidth) - (X86_EFL_SF_BIT + 1))) )
73
/**
 * Derives the zero flag (ZF) from a result.
 *
 * ZF is set exactly when the result is zero.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (a_uResult) != 0 ? (uint32_t)0 : (uint32_t)1 << X86_EFL_ZF_BIT )
84
/**
 * Moves the top bit of an overflow calculation result into the OF position.
 *
 * These are typically used by concatenating the macro name with a bit count.
 * The 8-bit variant has to shift left, whereas the wider variants shift right.
 *
 * @returns X86_EFL_OF or 0.
 * @param   a_uValue        The overflow calculation result; only its most
 *                          significant bit (for the given width) matters.
 */
#define X86_EFL_GET_OF_8(a_uValue)  ( X86_EFL_OF & ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT + 1 - 8)) )
#define X86_EFL_GET_OF_16(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (16 - 1 - X86_EFL_OF_BIT)) )
#define X86_EFL_GET_OF_32(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (32 - 1 - X86_EFL_OF_BIT)) )
#define X86_EFL_GET_OF_64(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (64 - 1 - X86_EFL_OF_BIT)) )
95
/**
 * Recalculates the status bits (CF, PF, AF, ZF, SF, and OF) after an
 * arithmetic operation.
 *
 * This is a statement macro: the new flags are stored through @a a_pfEFlags,
 * nothing is returned.  All bits outside X86_EFL_STATUS_BITS are preserved.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  It is
 *                          token-pasted into type and macro names, so it must
 *                          be given as a plain number.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        /* Keep the non-status bits and merge in CF, PF (table lookup on the low \
           byte), AF (xor of result, source and destination masked with \
           X86_EFL_AF), ZF and SF in one go. */ \
        uint32_t fEflNew = (*(a_pfEFlags) & ~X86_EFL_STATUS_BITS) \
                         | ((a_CfExpr) << X86_EFL_CF_BIT) \
                         | g_afParity[(a_uResult) & 0xff] \
                         | (((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF) \
                         | X86_EFL_CALC_ZF(a_uResult) \
                         | X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same sign \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition (2 - 1 == 2 + -1), for \
           SUBtraction the sign bits of the two inputs must differ and the sign \
           bit of the result must differ from that of the first input. \
           Note! Callers must xor the source with the sign bit to convert, not \
                 do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
132
/**
 * Recalculates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical
 * operation.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined; it is left clear here, as that seems to make the most sense
 * (which probably makes it the most wrong in real life).
 *
 * This is a statement macro: the new flags are stored through @a a_pfEFlags,
 * nothing is returned.  All bits outside X86_EFL_STATUS_BITS are preserved.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        /* Everything but the status bits survives from the old value. */ \
        uint32_t const fEflKept = *(a_pfEFlags) & ~X86_EFL_STATUS_BITS; \
        /* PF, ZF and SF come from the result; the caller may add more bits. */ \
        uint32_t const fEflCalc = g_afParity[(a_uResult) & 0xff] \
                                | X86_EFL_CALC_ZF(a_uResult) \
                                | X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
                                | (a_fExtra); \
        *(a_pfEFlags) = fEflKept | fEflCalc; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
/* Forward declarations of the constants that are defined further down in this
   file.  NOTE(review): presumably clang wants a prior declaration before each
   of these non-static definitions - confirm. */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT80U g_ar80One[];
464extern const RTFLOAT80U g_r80Indefinite;
465extern const RTFLOAT32U g_ar32Infinity[];
466extern const RTFLOAT64U g_ar64Infinity[];
467extern const RTFLOAT80U g_ar80Infinity[];
468extern const RTFLOAT128U g_r128Ln2;
469extern const RTUINT128U g_u128Ln2Mantissa;
470extern const RTUINT128U g_u128Ln2MantissaIntel;
471extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
472extern const RTFLOAT32U g_ar32QNaN[];
473extern const RTFLOAT64U g_ar64QNaN[];
474
475/** Zero values (indexed by fSign: [0] is built with sign argument 0, [1] with
 * sign argument 1).  One table per floating point width (32, 64 and 80 bits). */
476RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
477RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
478RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
479
480/** One values (indexed by fSign).  Both entries have only bit 63 of the
 * mantissa set and an exponent equal to RTFLOAT80U_EXP_BIAS; they differ in
 * the sign argument only. */
481RTFLOAT80U const g_ar80One[] =
482{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
483
484/** Indefinite (negative, i.e. built with sign argument 1). */
485RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
486
487/** Infinities (indexed by fSign: [0] is built with sign argument 0, [1] with
 * sign argument 1). */
488RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
489RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
490RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
491
492/** Default QNaNs (indexed by fSign: [0] is built with sign argument 0, [1]
 * with sign argument 1). */
493RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
494RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
495
496
497#if 0
/* Not compiled: the surrounding "#if 0" disables this definition. */
498/** 128-bit floating point constant: 2.0 (zero mantissa arguments, exponent
 * of RTFLOAT128U_EXP_BIAS + 1). */
499const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
500#endif
501
502
503/* The next section is generated by tools/IEMGenFpuConstants: */
504
505/** The ln2 constant as 128-bit floating point value.
506 * base-10: 6.93147180559945309417232121458176575e-1
507 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
508 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
/* Note: the active definition below is not the full-precision value listed
   above.  Compared with the commented-out initializer, the low part of the
   mantissa is cut down from 0xf35793c7673007e6 to 0xf357900000000000.
   NOTE(review): the reason for the truncation is not stated here - confirm
   before changing either line. */
509//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
510const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
511/** High precision ln2 value (128-bit mantissa given as two 64-bit halves).
512 * base-10: 6.931471805599453094172321214581765680747e-1
513 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
514 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
515const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
516/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
 * Same first 64-bit half as g_u128Ln2Mantissa; the second half keeps only its
 * top two bits (0xc000000000000000).
517 * base-10: 6.931471805599453094151379470289064954613e-1
518 * base-16: b.17217f7d1cf79abc0000000000000000@-1
519 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
520const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
521
522/** Horner constants for f2xm1.
 *
 * 22 entries, a0 thru a21.  Going by the base-10 values listed with each
 * entry, aN holds 1/(N+1)! (a0 = 1, a1 = 1/2, a2 = 1/6, a3 = 1/24, ...).
 * Each entry's comment gives the value in base 10, 16 and 2. */
523const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
524{
525 /* a0
526 * base-10: 1.00000000000000000000000000000000000e0
527 * base-16: 1.0000000000000000000000000000@0
528 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
529 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
530 /* a1
531 * base-10: 5.00000000000000000000000000000000000e-1
532 * base-16: 8.0000000000000000000000000000@-1
533 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
534 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
535 /* a2
536 * base-10: 1.66666666666666666666666666666666658e-1
537 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
538 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
539 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
540 /* a3
541 * base-10: 4.16666666666666666666666666666666646e-2
542 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
543 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
544 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
545 /* a4
546 * base-10: 8.33333333333333333333333333333333323e-3
547 * base-16: 2.2222222222222222222222222222@-2
548 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
549 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
550 /* a5
551 * base-10: 1.38888888888888888888888888888888874e-3
552 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
553 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
554 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
555 /* a6
556 * base-10: 1.98412698412698412698412698412698412e-4
557 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
558 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
559 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
560 /* a7
561 * base-10: 2.48015873015873015873015873015873015e-5
562 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
563 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
564 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
565 /* a8
566 * base-10: 2.75573192239858906525573192239858902e-6
567 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
568 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
569 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
570 /* a9
571 * base-10: 2.75573192239858906525573192239858865e-7
572 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
573 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
574 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
575 /* a10
576 * base-10: 2.50521083854417187750521083854417184e-8
577 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
578 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
579 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
580 /* a11
581 * base-10: 2.08767569878680989792100903212014296e-9
582 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
583 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
584 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
585 /* a12
586 * base-10: 1.60590438368216145993923771701549472e-10
587 * base-16: b.092309d43684be51c198e91d7b40@-9
588 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
589 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
590 /* a13
591 * base-10: 1.14707455977297247138516979786821043e-11
592 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
593 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
594 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
595 /* a14
596 * base-10: 7.64716373181981647590113198578806964e-13
597 * base-16: d.73f9f399dc0f88ec32b587746578@-11
598 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
599 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
600 /* a15
601 * base-10: 4.77947733238738529743820749111754352e-14
602 * base-16: d.73f9f399dc0f88ec32b587746578@-12
603 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
604 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
605 /* a16
606 * base-10: 2.81145725434552076319894558301031970e-15
607 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
608 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
609 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
610 /* a17
611 * base-10: 1.56192069685862264622163643500573321e-16
612 * base-16: b.413c31dcbecbbdd8024435161550@-14
613 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
614 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
615 /* a18
616 * base-10: 8.22063524662432971695598123687227980e-18
617 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
618 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
619 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
620 /* a19
621 * base-10: 4.11031762331216485847799061843614006e-19
622 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
623 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
624 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
625 /* a20
626 * base-10: 1.95729410633912612308475743735054143e-20
627 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
628 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
629 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
630 /* a21
631 * base-10: 8.89679139245057328674889744250246106e-22
632 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
633 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
634 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
635};
636
637
638/*
639 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
640 * it all in C is probably safer atm., optimize what's necessary later, maybe.
641 */
642#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
643
644
645/*********************************************************************************************************************************
646* Binary Operations *
647*********************************************************************************************************************************/
648
649/*
650 * ADD
651 */
652
653IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
654{
655 uint64_t uDst = *puDst;
656 uint64_t uResult = uDst + uSrc;
657 *puDst = uResult;
658 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
659}
660
661# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
662
663IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
664{
665 uint32_t uDst = *puDst;
666 uint32_t uResult = uDst + uSrc;
667 *puDst = uResult;
668 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
669}
670
671
672IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
673{
674 uint16_t uDst = *puDst;
675 uint16_t uResult = uDst + uSrc;
676 *puDst = uResult;
677 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
678}
679
680
681IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
682{
683 uint8_t uDst = *puDst;
684 uint8_t uResult = uDst + uSrc;
685 *puDst = uResult;
686 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
687}
688
689# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
690
691/*
692 * ADC
693 */
694
695IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
696{
697 if (!(*pfEFlags & X86_EFL_CF))
698 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
699 else
700 {
701 uint64_t uDst = *puDst;
702 uint64_t uResult = uDst + uSrc + 1;
703 *puDst = uResult;
704 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
705 }
706}
707
708# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
709
710IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
711{
712 if (!(*pfEFlags & X86_EFL_CF))
713 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
714 else
715 {
716 uint32_t uDst = *puDst;
717 uint32_t uResult = uDst + uSrc + 1;
718 *puDst = uResult;
719 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
720 }
721}
722
723
724IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
725{
726 if (!(*pfEFlags & X86_EFL_CF))
727 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
728 else
729 {
730 uint16_t uDst = *puDst;
731 uint16_t uResult = uDst + uSrc + 1;
732 *puDst = uResult;
733 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
734 }
735}
736
737
738IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
739{
740 if (!(*pfEFlags & X86_EFL_CF))
741 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
742 else
743 {
744 uint8_t uDst = *puDst;
745 uint8_t uResult = uDst + uSrc + 1;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
748 }
749}
750
751# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
752
753/*
754 * SUB
755 */
756
757IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
758{
759 uint64_t uDst = *puDst;
760 uint64_t uResult = uDst - uSrc;
761 *puDst = uResult;
762 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
763}
764
765# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
766
767IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
768{
769 uint32_t uDst = *puDst;
770 uint32_t uResult = uDst - uSrc;
771 *puDst = uResult;
772 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
773}
774
775
776IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
777{
778 uint16_t uDst = *puDst;
779 uint16_t uResult = uDst - uSrc;
780 *puDst = uResult;
781 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
782}
783
784
785IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
786{
787 uint8_t uDst = *puDst;
788 uint8_t uResult = uDst - uSrc;
789 *puDst = uResult;
790 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
791}
792
793# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
794
795/*
796 * SBB
797 */
798
799IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
800{
801 if (!(*pfEFlags & X86_EFL_CF))
802 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
803 else
804 {
805 uint64_t uDst = *puDst;
806 uint64_t uResult = uDst - uSrc - 1;
807 *puDst = uResult;
808 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
809 }
810}
811
812# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
813
814IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
815{
816 if (!(*pfEFlags & X86_EFL_CF))
817 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
818 else
819 {
820 uint32_t uDst = *puDst;
821 uint32_t uResult = uDst - uSrc - 1;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
824 }
825}
826
827
828IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
829{
830 if (!(*pfEFlags & X86_EFL_CF))
831 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
832 else
833 {
834 uint16_t uDst = *puDst;
835 uint16_t uResult = uDst - uSrc - 1;
836 *puDst = uResult;
837 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
838 }
839}
840
841
842IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
843{
844 if (!(*pfEFlags & X86_EFL_CF))
845 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
846 else
847 {
848 uint8_t uDst = *puDst;
849 uint8_t uResult = uDst - uSrc - 1;
850 *puDst = uResult;
851 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
852 }
853}
854
855# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
856
857
858/*
859 * OR
860 */
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
863{
864 uint64_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
867}
868
869# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
870
871IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
872{
873 uint32_t uResult = *puDst | uSrc;
874 *puDst = uResult;
875 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
876}
877
878
879IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
880{
881 uint16_t uResult = *puDst | uSrc;
882 *puDst = uResult;
883 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
884}
885
886
887IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
888{
889 uint8_t uResult = *puDst | uSrc;
890 *puDst = uResult;
891 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896/*
897 * XOR
898 */
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
901{
902 uint64_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
905}
906
907# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
908
909IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
910{
911 uint32_t uResult = *puDst ^ uSrc;
912 *puDst = uResult;
913 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
914}
915
916
917IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
918{
919 uint16_t uResult = *puDst ^ uSrc;
920 *puDst = uResult;
921 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
922}
923
924
925IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
926{
927 uint8_t uResult = *puDst ^ uSrc;
928 *puDst = uResult;
929 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
930}
931
932# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
933
934/*
935 * AND
936 */
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
939{
940 uint64_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
943}
944
945# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
946
947IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
948{
949 uint32_t const uResult = *puDst & uSrc;
950 *puDst = uResult;
951 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
952}
953
954
955IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
956{
957 uint16_t const uResult = *puDst & uSrc;
958 *puDst = uResult;
959 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
960}
961
962
963IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
964{
965 uint8_t const uResult = *puDst & uSrc;
966 *puDst = uResult;
967 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
968}
969
970# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
971#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
972
973/*
974 * ANDN (BMI1 instruction)
975 */
976
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
978{
979 uint64_t const uResult = ~uSrc1 & uSrc2;
980 *puDst = uResult;
981 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
982}
983
984
985IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
986{
987 uint32_t const uResult = ~uSrc1 & uSrc2;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
990}
991
992
993#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/** ANDN, 64-bit: C variant that simply forwards to iemAImpl_andn_u64_fallback. */
994IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
995{
996 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
997}
998#endif
999
1000
1001#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
/** ANDN, 32-bit: C variant that simply forwards to iemAImpl_andn_u32_fallback. */
1002IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1003{
1004 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1005}
1006#endif
1007
1008#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1009
1010/*
1011 * CMP
1012 */
1013
1014IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1015{
1016 uint64_t uDstTmp = *puDst;
1017 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1018}
1019
1020# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1021
1022IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1023{
1024 uint32_t uDstTmp = *puDst;
1025 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1026}
1027
1028
1029IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1030{
1031 uint16_t uDstTmp = *puDst;
1032 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1033}
1034
1035
1036IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1037{
1038 uint8_t uDstTmp = *puDst;
1039 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1040}
1041
1042# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1043
1044/*
1045 * TEST
1046 */
1047
1048IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1049{
1050 uint64_t uResult = *puDst & uSrc;
1051 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1052}
1053
1054# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1055
1056IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1057{
1058 uint32_t uResult = *puDst & uSrc;
1059 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1060}
1061
1062
1063IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1064{
1065 uint16_t uResult = *puDst & uSrc;
1066 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1067}
1068
1069
1070IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1071{
1072 uint8_t uResult = *puDst & uSrc;
1073 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1074}
1075
1076# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1077
1078
1079/*
1080 * LOCK prefixed variants of the above
1081 */
1082
1083/** 64-bit locked binary operand operation. */
1084# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1085 do { \
1086 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1087 uint ## a_cBitsWidth ## _t uTmp; \
1088 uint32_t fEflTmp; \
1089 do \
1090 { \
1091 uTmp = uOld; \
1092 fEflTmp = *pfEFlags; \
1093 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, uSrc, &fEflTmp); \
1094 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
1095 *pfEFlags = fEflTmp; \
1096 } while (0)
1097
1098
1099#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1100 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1101 uint ## a_cBitsWidth ## _t uSrc, \
1102 uint32_t *pfEFlags)) \
1103 { \
1104 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1105 }
1106
1107EMIT_LOCKED_BIN_OP(add, 64)
1108EMIT_LOCKED_BIN_OP(adc, 64)
1109EMIT_LOCKED_BIN_OP(sub, 64)
1110EMIT_LOCKED_BIN_OP(sbb, 64)
1111EMIT_LOCKED_BIN_OP(or, 64)
1112EMIT_LOCKED_BIN_OP(xor, 64)
1113EMIT_LOCKED_BIN_OP(and, 64)
1114# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1115EMIT_LOCKED_BIN_OP(add, 32)
1116EMIT_LOCKED_BIN_OP(adc, 32)
1117EMIT_LOCKED_BIN_OP(sub, 32)
1118EMIT_LOCKED_BIN_OP(sbb, 32)
1119EMIT_LOCKED_BIN_OP(or, 32)
1120EMIT_LOCKED_BIN_OP(xor, 32)
1121EMIT_LOCKED_BIN_OP(and, 32)
1122
1123EMIT_LOCKED_BIN_OP(add, 16)
1124EMIT_LOCKED_BIN_OP(adc, 16)
1125EMIT_LOCKED_BIN_OP(sub, 16)
1126EMIT_LOCKED_BIN_OP(sbb, 16)
1127EMIT_LOCKED_BIN_OP(or, 16)
1128EMIT_LOCKED_BIN_OP(xor, 16)
1129EMIT_LOCKED_BIN_OP(and, 16)
1130
1131EMIT_LOCKED_BIN_OP(add, 8)
1132EMIT_LOCKED_BIN_OP(adc, 8)
1133EMIT_LOCKED_BIN_OP(sub, 8)
1134EMIT_LOCKED_BIN_OP(sbb, 8)
1135EMIT_LOCKED_BIN_OP(or, 8)
1136EMIT_LOCKED_BIN_OP(xor, 8)
1137EMIT_LOCKED_BIN_OP(and, 8)
1138# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1139
1140
1141/*
1142 * Bit operations (same signature as above).
1143 */
1144
1145/*
1146 * BT
1147 */
1148
1149IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1150{
1151 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1152 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1153 Assert(uSrc < 64);
1154 uint64_t uDst = *puDst;
1155 if (uDst & RT_BIT_64(uSrc))
1156 *pfEFlags |= X86_EFL_CF;
1157 else
1158 *pfEFlags &= ~X86_EFL_CF;
1159}
1160
1161# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1162
1163IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1164{
1165 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1166 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1167 Assert(uSrc < 32);
1168 uint32_t uDst = *puDst;
1169 if (uDst & RT_BIT_32(uSrc))
1170 *pfEFlags |= X86_EFL_CF;
1171 else
1172 *pfEFlags &= ~X86_EFL_CF;
1173}
1174
1175IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1176{
1177 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1178 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1179 Assert(uSrc < 16);
1180 uint16_t uDst = *puDst;
1181 if (uDst & RT_BIT_32(uSrc))
1182 *pfEFlags |= X86_EFL_CF;
1183 else
1184 *pfEFlags &= ~X86_EFL_CF;
1185}
1186
1187# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1188
1189/*
1190 * BTC
1191 */
1192
1193IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1194{
1195 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1196 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1197 Assert(uSrc < 64);
1198 uint64_t fMask = RT_BIT_64(uSrc);
1199 uint64_t uDst = *puDst;
1200 if (uDst & fMask)
1201 {
1202 uDst &= ~fMask;
1203 *puDst = uDst;
1204 *pfEFlags |= X86_EFL_CF;
1205 }
1206 else
1207 {
1208 uDst |= fMask;
1209 *puDst = uDst;
1210 *pfEFlags &= ~X86_EFL_CF;
1211 }
1212}
1213
1214# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1215
1216IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1217{
1218 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1219 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1220 Assert(uSrc < 32);
1221 uint32_t fMask = RT_BIT_32(uSrc);
1222 uint32_t uDst = *puDst;
1223 if (uDst & fMask)
1224 {
1225 uDst &= ~fMask;
1226 *puDst = uDst;
1227 *pfEFlags |= X86_EFL_CF;
1228 }
1229 else
1230 {
1231 uDst |= fMask;
1232 *puDst = uDst;
1233 *pfEFlags &= ~X86_EFL_CF;
1234 }
1235}
1236
1237
1238IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1239{
1240 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1241 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1242 Assert(uSrc < 16);
1243 uint16_t fMask = RT_BIT_32(uSrc);
1244 uint16_t uDst = *puDst;
1245 if (uDst & fMask)
1246 {
1247 uDst &= ~fMask;
1248 *puDst = uDst;
1249 *pfEFlags |= X86_EFL_CF;
1250 }
1251 else
1252 {
1253 uDst |= fMask;
1254 *puDst = uDst;
1255 *pfEFlags &= ~X86_EFL_CF;
1256 }
1257}
1258
1259# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1260
1261/*
1262 * BTR
1263 */
1264
1265IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1266{
1267 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1268 logical operation (AND/OR/whatever). */
1269 Assert(uSrc < 64);
1270 uint64_t fMask = RT_BIT_64(uSrc);
1271 uint64_t uDst = *puDst;
1272 if (uDst & fMask)
1273 {
1274 uDst &= ~fMask;
1275 *puDst = uDst;
1276 *pfEFlags |= X86_EFL_CF;
1277 }
1278 else
1279 *pfEFlags &= ~X86_EFL_CF;
1280}
1281
1282# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1283
1284IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1285{
1286 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1287 logical operation (AND/OR/whatever). */
1288 Assert(uSrc < 32);
1289 uint32_t fMask = RT_BIT_32(uSrc);
1290 uint32_t uDst = *puDst;
1291 if (uDst & fMask)
1292 {
1293 uDst &= ~fMask;
1294 *puDst = uDst;
1295 *pfEFlags |= X86_EFL_CF;
1296 }
1297 else
1298 *pfEFlags &= ~X86_EFL_CF;
1299}
1300
1301
1302IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1303{
1304 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1305 logical operation (AND/OR/whatever). */
1306 Assert(uSrc < 16);
1307 uint16_t fMask = RT_BIT_32(uSrc);
1308 uint16_t uDst = *puDst;
1309 if (uDst & fMask)
1310 {
1311 uDst &= ~fMask;
1312 *puDst = uDst;
1313 *pfEFlags |= X86_EFL_CF;
1314 }
1315 else
1316 *pfEFlags &= ~X86_EFL_CF;
1317}
1318
1319# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1320
1321/*
1322 * BTS
1323 */
1324
1325IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1326{
1327 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1328 logical operation (AND/OR/whatever). */
1329 Assert(uSrc < 64);
1330 uint64_t fMask = RT_BIT_64(uSrc);
1331 uint64_t uDst = *puDst;
1332 if (uDst & fMask)
1333 *pfEFlags |= X86_EFL_CF;
1334 else
1335 {
1336 uDst |= fMask;
1337 *puDst = uDst;
1338 *pfEFlags &= ~X86_EFL_CF;
1339 }
1340}
1341
1342# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1343
1344IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1345{
1346 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1347 logical operation (AND/OR/whatever). */
1348 Assert(uSrc < 32);
1349 uint32_t fMask = RT_BIT_32(uSrc);
1350 uint32_t uDst = *puDst;
1351 if (uDst & fMask)
1352 *pfEFlags |= X86_EFL_CF;
1353 else
1354 {
1355 uDst |= fMask;
1356 *puDst = uDst;
1357 *pfEFlags &= ~X86_EFL_CF;
1358 }
1359}
1360
1361
1362IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1363{
1364 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1365 logical operation (AND/OR/whatever). */
1366 Assert(uSrc < 16);
1367 uint16_t fMask = RT_BIT_32(uSrc);
1368 uint32_t uDst = *puDst;
1369 if (uDst & fMask)
1370 *pfEFlags |= X86_EFL_CF;
1371 else
1372 {
1373 uDst |= fMask;
1374 *puDst = uDst;
1375 *pfEFlags &= ~X86_EFL_CF;
1376 }
1377}
1378
1379# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1380
1381
/* LOCK prefixed variants of BTC, BTR and BTS, generated with
   EMIT_LOCKED_BIN_OP.  No locked variant is emitted for BT here. */
1382EMIT_LOCKED_BIN_OP(btc, 64)
1383EMIT_LOCKED_BIN_OP(btr, 64)
1384EMIT_LOCKED_BIN_OP(bts, 64)
1385# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1386EMIT_LOCKED_BIN_OP(btc, 32)
1387EMIT_LOCKED_BIN_OP(btr, 32)
1388EMIT_LOCKED_BIN_OP(bts, 32)
1389
1390EMIT_LOCKED_BIN_OP(btc, 16)
1391EMIT_LOCKED_BIN_OP(btr, 16)
1392EMIT_LOCKED_BIN_OP(bts, 16)
1393# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1394
1395
1396/*
1397 * Helpers for BSR and BSF.
1398 *
1399 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1400 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1401 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1402 * but we restrict ourselves to emulating these recent marchs.
1403 */
/**
 * Stores a BSF/BSR style result, Intel flavour.
 *
 * @param   a_puDst     Where to store the zero-based bit index.  Left
 *                      unmodified when @a a_iBit is zero.
 * @param   a_pfEFlags  The EFLAGS value to update.  OF, SF, ZF, AF, PF and CF
 *                      are all cleared first; then either PF is set from
 *                      g_afParity for the stored index, or ZF and PF are set
 *                      when @a a_iBit is zero.
 * @param   a_iBit      One-based index of the bit found, zero if none.
 *
 * @note    All arguments are now real macro parameters; earlier the flags
 *          pointer was picked up from a variable named pfEFlags at the
 *          expansion site regardless of what was passed.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl      |= g_afParity[iBit]; \
        } \
        else \
            fEfl      |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR style result, AMD flavour.
 *
 * @param   a_puDst     Where to store the zero-based bit index.  Left
 *                      unmodified when @a a_iBit is zero.
 * @param   a_pfEFlags  The EFLAGS value to update.  Only ZF is changed: it is
 *                      cleared when a bit was found and set otherwise.
 * @param   a_iBit      One-based index of the bit found, zero if none.
 *
 * @note    All arguments are now real macro parameters; earlier the flags
 *          pointer was picked up from a variable named pfEFlags at the
 *          expansion site regardless of what was passed.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1426
1427
1428/*
1429 * BSF - first (least significant) bit set
1430 */
1431IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1432{
1433 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1434}
1435
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1451}
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1467}
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1480
1481
1482/*
1483 * BSR - last (most significant) bit set
1484 */
1485IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1486{
1487 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1488}
1489
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1505}
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1521}
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1534
1535
1536/*
1537 * Helpers for LZCNT and TZCNT.
1538 */
/**
 * Stores an LZCNT/TZCNT style result, Intel flavour.
 *
 * @param   a_puDst     Where to store the count (always written).
 * @param   a_uSrc      The source operand; CF is set when it is zero.  The
 *                      argument is parenthesized before negation so that an
 *                      expression may be passed safely.
 * @param   a_pfEFlags  The EFLAGS value to update.  OF, SF, ZF, AF, PF and CF
 *                      are cleared first; PF then comes from g_afParity for a
 *                      non-zero count, while a zero count sets ZF and PF.
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT style result, AMD flavour.
 *
 * @param   a_puDst     Where to store the count (always written).
 * @param   a_uSrc      The source operand; CF is set when it is zero.  The
 *                      argument is parenthesized before negation so that an
 *                      expression may be passed safely.
 * @param   a_pfEFlags  The EFLAGS value to update.  Only ZF and CF are
 *                      changed: ZF is set for a zero count, CF for a zero
 *                      source.
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1561
1562
1563/*
1564 * LZCNT - count leading zero bits.
1565 */
1566IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1567{
1568 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1569}
1570
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1584{
1585 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1586}
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598
1599IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1600{
1601 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1602}
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1615
1616
1617/*
1618 * TZCNT - count trailing zero bits.
1619 */
1620IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1621{
1622 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1623}
1624
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1636
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1638{
1639 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1640}
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1654{
1655 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1656}
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1669#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1670
1671/*
1672 * BEXTR (BMI1 instruction)
1673 */
1674#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1675IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1676 a_Type uSrc2, uint32_t *pfEFlags)) \
1677{ \
1678 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1679 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1680 a_Type uResult; \
1681 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1682 if (iFirstBit < a_cBits) \
1683 { \
1684 uResult = uSrc1 >> iFirstBit; \
1685 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1686 if (cBits < a_cBits) \
1687 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1688 *puDst = uResult; \
1689 if (!uResult) \
1690 fEfl |= X86_EFL_ZF; \
1691 } \
1692 else \
1693 { \
1694 *puDst = uResult = 0; \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 /** @todo complete flag calculations. */ \
1698 *pfEFlags = fEfl; \
1699}
1700
1701EMIT_BEXTR(64, uint64_t, _fallback)
1702EMIT_BEXTR(32, uint32_t, _fallback)
1703#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1704EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1705#endif
1706#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1708#endif
1709
1710/*
1711 * BLSR (BMI1 instruction)
1712 */
1713#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1714IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1715{ \
1716 uint32_t fEfl1 = *pfEFlags; \
1717 uint32_t fEfl2 = fEfl1; \
1718 *puDst = uSrc; \
1719 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1720 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1721 \
1722 /* AMD: The carry flag is from the SUB operation. */ \
1723 /* 10890xe: PF always cleared? */ \
1724 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1725 fEfl2 |= fEfl1 & X86_EFL_CF; \
1726 *pfEFlags = fEfl2; \
1727}
1728
1729EMIT_BLSR(64, uint64_t, _fallback)
1730EMIT_BLSR(32, uint32_t, _fallback)
1731#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1732EMIT_BLSR(64, uint64_t, RT_NOTHING)
1733#endif
1734#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSR(32, uint32_t, RT_NOTHING)
1736#endif
1737
1738/*
1739 * BLSMSK (BMI1 instruction)
1740 */
1741#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1742IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1743{ \
1744 uint32_t fEfl1 = *pfEFlags; \
1745 uint32_t fEfl2 = fEfl1; \
1746 *puDst = uSrc; \
1747 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1748 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1749 \
1750 /* AMD: The carry flag is from the SUB operation. */ \
1751 /* 10890xe: PF always cleared? */ \
1752 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1753 fEfl2 |= fEfl1 & X86_EFL_CF; \
1754 *pfEFlags = fEfl2; \
1755}
1756
1757EMIT_BLSMSK(64, uint64_t, _fallback)
1758EMIT_BLSMSK(32, uint32_t, _fallback)
1759#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1760EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1761#endif
1762#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1764#endif
1765
1766/*
1767 * BLSI (BMI1 instruction)
1768 */
1769#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1770IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1771{ \
1772 uint32_t fEfl1 = *pfEFlags; \
1773 uint32_t fEfl2 = fEfl1; \
1774 *puDst = uSrc; \
1775 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1776 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1777 \
1778 /* AMD: The carry flag is from the SUB operation. */ \
1779 /* 10890xe: PF always cleared? */ \
1780 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1781 fEfl2 |= fEfl1 & X86_EFL_CF; \
1782 *pfEFlags = fEfl2; \
1783}
1784
1785EMIT_BLSI(64, uint64_t, _fallback)
1786EMIT_BLSI(32, uint32_t, _fallback)
1787#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1788EMIT_BLSI(64, uint64_t, RT_NOTHING)
1789#endif
1790#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1791EMIT_BLSI(32, uint32_t, RT_NOTHING)
1792#endif
1793
1794/*
1795 * BZHI (BMI2 instruction)
1796 */
1797#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1798IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1799 a_Type uSrc2, uint32_t *pfEFlags)) \
1800{ \
1801 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1802 a_Type uResult; \
1803 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1804 if (iFirstBit < a_cBits) \
1805 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1806 else \
1807 { \
1808 uResult = uSrc1; \
1809 fEfl |= X86_EFL_CF; \
1810 } \
1811 *puDst = uResult; \
1812 fEfl |= X86_EFL_CALC_ZF(uResult); \
1813 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1814 *pfEFlags = fEfl; \
1815}
1816
1817EMIT_BZHI(64, uint64_t, _fallback)
1818EMIT_BZHI(32, uint32_t, _fallback)
1819#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1820EMIT_BZHI(64, uint64_t, RT_NOTHING)
1821#endif
1822#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1823EMIT_BZHI(32, uint32_t, RT_NOTHING)
1824#endif
1825
1826/*
1827 * POPCNT
1828 */
1829RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1830{
1831 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1832 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1833 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1834 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1835};
1836
1837/** @todo Use native popcount where possible and employ some more efficient
1838 * algorithm here (or in asm.h fallback)! */
1839
1840DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1841{
1842 return g_abBitCounts6[ u16 & 0x3f]
1843 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1844 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1845}
1846
1847DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1848{
1849 return g_abBitCounts6[ u32 & 0x3f]
1850 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1851 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1852 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1853 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1854 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1855}
1856
1857DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1858{
1859 return g_abBitCounts6[ u64 & 0x3f]
1860 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1861 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1862 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1863 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1864 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1865 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1870}
1871
1872#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1873IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1874{ \
1875 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1876 a_Type uResult; \
1877 if (uSrc) \
1878 uResult = iemPopCountU ## a_cBits(uSrc); \
1879 else \
1880 { \
1881 fEfl |= X86_EFL_ZF; \
1882 uResult = 0; \
1883 } \
1884 *puDst = uResult; \
1885 *pfEFlags = fEfl; \
1886}
1887
1888EMIT_POPCNT(64, uint64_t, _fallback)
1889EMIT_POPCNT(32, uint32_t, _fallback)
1890EMIT_POPCNT(16, uint16_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1896EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1897#endif
1898
1899
1900#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1901
1902/*
1903 * XCHG
1904 */
1905
1906IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1907{
1908#if ARCH_BITS >= 64
1909 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1910#else
1911 uint64_t uOldMem = *puMem;
1912 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1913 ASMNopPause();
1914 *puReg = uOldMem;
1915#endif
1916}
1917
1918# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1919
1920IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1921{
1922 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1923}
1924
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1927{
1928 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1929}
1930
1931
1932IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1933{
1934 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1935}
1936
1937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1938
1939
1940/* Unlocked variants for fDisregardLock mode: */
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1943{
1944 uint64_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1950
1951IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1952{
1953 uint32_t const uOld = *puMem;
1954 *puMem = *puReg;
1955 *puReg = uOld;
1956}
1957
1958
1959IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1960{
1961 uint16_t const uOld = *puMem;
1962 *puMem = *puReg;
1963 *puReg = uOld;
1964}
1965
1966
1967IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1968{
1969 uint8_t const uOld = *puMem;
1970 *puMem = *puReg;
1971 *puReg = uOld;
1972}
1973
1974# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1975
1976
1977/*
1978 * XADD and LOCK XADD.
1979 */
1980#define EMIT_XADD(a_cBitsWidth, a_Type) \
1981IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1982{ \
1983 a_Type uDst = *puDst; \
1984 a_Type uResult = uDst; \
1985 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1986 *puDst = uResult; \
1987 *puReg = uDst; \
1988} \
1989\
1990IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1991{ \
1992 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1993 a_Type uResult; \
1994 uint32_t fEflTmp; \
1995 do \
1996 { \
1997 uResult = uOld; \
1998 fEflTmp = *pfEFlags; \
1999 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2000 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2001 *puReg = uOld; \
2002 *pfEFlags = fEflTmp; \
2003}
2004EMIT_XADD(64, uint64_t)
2005# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2006EMIT_XADD(32, uint32_t)
2007EMIT_XADD(16, uint16_t)
2008EMIT_XADD(8, uint8_t)
2009# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2010
2011#endif
2012
2013/*
2014 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2015 *
2016 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2017 * instructions are emulated as locked.
2018 */
2019#if defined(IEM_WITHOUT_ASSEMBLY)
2020
2021IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2022{
2023 uint8_t uOld = *puAl;
2024 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2025 Assert(*puAl == uOld);
2026 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2031{
2032 uint16_t uOld = *puAx;
2033 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2034 Assert(*puAx == uOld);
2035 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2040{
2041 uint32_t uOld = *puEax;
2042 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2043 Assert(*puEax == uOld);
2044 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2045}
2046
2047
2048# if ARCH_BITS == 32
2049IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2050# else
2051IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2052# endif
2053{
2054# if ARCH_BITS == 32
2055 uint64_t const uSrcReg = *puSrcReg;
2056# endif
2057 uint64_t uOld = *puRax;
2058 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2059 Assert(*puRax == uOld);
2060 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2061}
2062
2063
2064IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2065 uint32_t *pEFlags))
2066{
2067 uint64_t const uNew = pu64EbxEcx->u;
2068 uint64_t const uOld = pu64EaxEdx->u;
2069 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2070 {
2071 Assert(pu64EaxEdx->u == uOld);
2072 *pEFlags |= X86_EFL_ZF;
2073 }
2074 else
2075 *pEFlags &= ~X86_EFL_ZF;
2076}
2077
2078
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * CMPXCHG16B done as an atomic 128-bit compare-exchange.
 *
 * @param   pu128Dst    The destination operand.
 * @param   pu128RaxRdx Supplies the comparand and is handed to the
 *                      compare-exchange as its old-value output.
 * @param   pu128RbxRcx The value to store on a match.
 * @param   pEFlags     ZF is set when the exchange took place and cleared
 *                      otherwise; no other flag is touched.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    RTUINT128U const uExpected = *pu128RaxRdx; /* only needed by the assertion below */
#  endif
#  if defined(RT_ARCH_AMD64)
    bool const fExchanged = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo,
                                                   pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo, &pu128RaxRdx->u);
#  else
    bool const fExchanged = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fExchanged)
        *pEFlags &= ~X86_EFL_ZF;
    else
    {
        Assert(pu128RaxRdx->s.Lo == uExpected.s.Lo && pu128RaxRdx->s.Hi == uExpected.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
}
# endif
2100
2101#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2102
2103# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2104IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2105 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2106{
2107 RTUINT128U u128Tmp = *pu128Dst;
2108 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2109 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2110 {
2111 *pu128Dst = *pu128RbxRcx;
2112 *pEFlags |= X86_EFL_ZF;
2113 }
2114 else
2115 {
2116 *pu128RaxRdx = u128Tmp;
2117 *pEFlags &= ~X86_EFL_ZF;
2118 }
2119}
2120#endif /* !RT_ARCH_ARM64 */
2121
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
2124/* Unlocked versions mapped to the locked ones: */
2125
2126IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2127{
2128 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2129}
2130
2131
2132IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2133{
2134 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2135}
2136
2137
2138IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2139{
2140 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2141}
2142
2143
2144# if ARCH_BITS == 32
2145IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2146{
2147 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2148}
2149# else
2150IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2151{
2152 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2153}
2154# endif
2155
2156
2157IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2158{
2159 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2160}
2161
2162
2163IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2164 uint32_t *pEFlags))
2165{
2166 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2167}
2168
2169#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2170
2171#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2172 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2173
/*
 * MUL, IMUL, DIV and IDIV helpers.
 *
 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
 *   multiplication and division steps so we can select between using C
 *   operators and RTUInt128DivRem/RTUInt128MulU64ByU64.
 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV taking all the input in AX too.  This means we have to abstract
 *   some input loads and the result storing.
 */
2185
2186DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2187{
2188# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2189 pQuotient->s.Lo = 0;
2190 pQuotient->s.Hi = 0;
2191# endif
2192 RTUINT128U Divisor;
2193 Divisor.s.Lo = u64Divisor;
2194 Divisor.s.Hi = 0;
2195 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2196}
2197
/*
 * Operand load/store and arithmetic primitives plugged into the EMIT_MUL,
 * EMIT_IMUL, EMIT_DIV and EMIT_IDIV templates.
 *
 * All of these expand to a single expression WITHOUT a trailing semicolon, so
 * they can be used wherever a statement is expected (the caller supplies the
 * semicolon), and every macro argument is parenthesized.  They rely on the
 * caller having puA/puD (or puAX for the 8-bit flavours) in scope.
 */

/* Load the dividend from the register pair (or from AX for 8-bit). */
# define DIV_LOAD(a_Dividend) \
    (a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD
# define DIV_LOAD_U8(a_Dividend) \
    (a_Dividend).u = *puAX

/* Store quotient and remainder; the 8-bit flavour packs them as low byte = quotient, high byte = remainder. */
# define DIV_STORE(a_Quotient, a_uRemainder)    *puA = (a_Quotient), *puD = (a_uRemainder)
# define DIV_STORE_U8(a_Quotient, a_uRemainder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)

/* Load the implicit first factor. */
# define MUL_LOAD_F1()                          (*puA)
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/* Store the double-width product. */
# define MUL_STORE(a_Result)                    *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
# define MUL_STORE_U8(a_Result)                 *puAX = (a_Result).u

/* Two's complement negation of a double-width value. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned multiplication producing a double-width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), (a_Factor1), (a_Factor2))

/* Unsigned division of a double-width dividend yielding quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
    (a_Remainder).u = (a_Dividend).u % (a_uDivisor)
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), (a_uDivisor))
2227
2228
2229/*
2230 * MUL
2231 */
2232# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2233IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2234{ \
2235 RTUINT ## a_cBitsWidth2x ## U Result; \
2236 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2237 a_fnStore(Result); \
2238 \
2239 /* Calc EFLAGS: */ \
2240 uint32_t fEfl = *pfEFlags; \
2241 if (a_fIntelFlags) \
2242 { /* Intel: 6700K and 10980XE behavior */ \
2243 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2244 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2245 fEfl |= X86_EFL_SF; \
2246 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2247 if (Result.s.Hi != 0) \
2248 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2249 } \
2250 else \
2251 { /* AMD: 3990X */ \
2252 if (Result.s.Hi != 0) \
2253 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2254 else \
2255 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2256 } \
2257 *pfEFlags = fEfl; \
2258 return 0; \
2259} \
2260
2261# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2262 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2263 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2264 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2265
2266# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2267EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2268 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2269# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2270EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2271 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2272EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2273 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2274EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2275 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2276# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2277# endif /* !DOXYGEN_RUNNING */
2278
2279/*
2280 * MULX
2281 */
2282# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2283IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2284 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2285{ \
2286 RTUINT ## a_cBitsWidth2x ## U Result; \
2287 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2288 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2289 *puDst1 = Result.s.Hi; \
2290} \
2291
2292# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2293EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2294EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2295# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2296EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2297EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2298# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2299# endif /* !DOXYGEN_RUNNING */
2300
2301
2302/*
2303 * IMUL
2304 *
2305 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2306 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2307 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2308 */
2309# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2310 a_Suffix, a_fIntelFlags) \
2311IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2312{ \
2313 RTUINT ## a_cBitsWidth2x ## U Result; \
2314 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2315 \
2316 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2317 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2318 { \
2319 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2320 { \
2321 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2322 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2323 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2324 } \
2325 else \
2326 { \
2327 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2328 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2329 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2330 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2331 a_fnNeg(Result, a_cBitsWidth2x); \
2332 } \
2333 } \
2334 else \
2335 { \
2336 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2337 { \
2338 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2339 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2340 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2341 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2342 a_fnNeg(Result, a_cBitsWidth2x); \
2343 } \
2344 else \
2345 { \
2346 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2347 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2348 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2349 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2350 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2351 } \
2352 } \
2353 a_fnStore(Result); \
2354 \
2355 if (a_fIntelFlags) \
2356 { \
2357 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2358 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2359 fEfl |= X86_EFL_SF; \
2360 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2361 } \
2362 *pfEFlags = fEfl; \
2363 return 0; \
2364}
2365# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2366 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2367 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2368 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2369
2370# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2371EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2372 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2373# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2374EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2375 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2376EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2377 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2378EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2379 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2380# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2381# endif /* !DOXYGEN_RUNNING */
2382
2383
2384/*
2385 * IMUL with two operands are mapped onto the three operand variant, ignoring
2386 * the high part of the product.
2387 */
2388# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2389IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2390{ \
2391 a_uType uIgn; \
2392 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2393} \
2394\
2395IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2396{ \
2397 a_uType uIgn; \
2398 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2399} \
2400\
2401IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2402{ \
2403 a_uType uIgn; \
2404 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2405}
2406
2407EMIT_IMUL_TWO(64, uint64_t)
2408# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2409EMIT_IMUL_TWO(32, uint32_t)
2410EMIT_IMUL_TWO(16, uint16_t)
2411# endif
2412
2413
2414/*
2415 * DIV
2416 */
2417# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2418 a_Suffix, a_fIntelFlags) \
2419IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2420{ \
2421 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2422 a_fnLoad(Dividend); \
2423 if ( uDivisor != 0 \
2424 && Dividend.s.Hi < uDivisor) \
2425 { \
2426 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2427 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2428 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2429 \
2430 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2431 if (!a_fIntelFlags) \
2432 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2433 return 0; \
2434 } \
2435 /* #DE */ \
2436 return -1; \
2437}
2438# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2439 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2440 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2441 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2442
2443# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2444EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2445 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2447EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2448 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2449EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2450 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2451EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2452 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2453# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2454# endif /* !DOXYGEN_RUNNING */
2455
2456
2457/*
2458 * IDIV
2459 *
2460 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2461 * set AF and clear PF, ZF and SF just like it does for DIV.
2462 *
2463 */
2464# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2465 a_Suffix, a_fIntelFlags) \
2466IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2467{ \
2468 /* Note! Skylake leaves all flags alone. */ \
2469 \
2470 /** @todo overflow checks */ \
2471 if (uDivisor != 0) \
2472 { \
2473 /* \
2474 * Convert to unsigned division. \
2475 */ \
2476 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2477 a_fnLoad(Dividend); \
2478 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2479 if (fSignedDividend) \
2480 a_fnNeg(Dividend, a_cBitsWidth2x); \
2481 \
2482 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2483 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2484 uDivisorPositive = uDivisor; \
2485 else \
2486 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2487 \
2488 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2489 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2490 \
2491 /* \
2492 * Setup the result, checking for overflows. \
2493 */ \
2494 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Positive divisor, positive dividend => result positive. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2500 { \
2501 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Positive divisor, negative dividend => result negative. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2511 { \
2512 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 else \
2520 { \
2521 if (!fSignedDividend) \
2522 { \
2523 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2524 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2525 { \
2526 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2527 if (!a_fIntelFlags) \
2528 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2529 return 0; \
2530 } \
2531 } \
2532 else \
2533 { \
2534 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2535 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2536 { \
2537 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2538 if (!a_fIntelFlags) \
2539 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2540 return 0; \
2541 } \
2542 } \
2543 } \
2544 } \
2545 /* #DE */ \
2546 return -1; \
2547}
2548# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2549 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2550 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2551 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2552
2553# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2554EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2555 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2557EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2558 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2559EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2560 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2561EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2562 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2563# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2564# endif /* !DOXYGEN_RUNNING */
2565
2566#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2567
2568
2569/*********************************************************************************************************************************
2570* Unary operations. *
2571*********************************************************************************************************************************/
2572#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2573
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * The macro expands to a do/while(0) statement and does not yield a value; the
 * new flags are written back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* Drop all status bits except CF, then rebuild them one by one. */ \
        uint32_t fEflNew = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS | X86_EFL_CF); \
        fEflNew |= g_afParity[(a_uResult) & 0xff]; \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* INC: sign bit clear before and set after.  DEC: sign bit set before and clear after. */ \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 \
                                                   ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
                                                   : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
2599
2600/*
2601 * INC
2602 */
2603
2604IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2605{
2606 uint64_t uDst = *puDst;
2607 uint64_t uResult = uDst + 1;
2608 *puDst = uResult;
2609 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2610}
2611
2612# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2613
2614IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2615{
2616 uint32_t uDst = *puDst;
2617 uint32_t uResult = uDst + 1;
2618 *puDst = uResult;
2619 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2620}
2621
2622
2623IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2624{
2625 uint16_t uDst = *puDst;
2626 uint16_t uResult = uDst + 1;
2627 *puDst = uResult;
2628 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2629}
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint8_t uDst = *puDst;
2634 uint8_t uResult = uDst + 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2637}
2638
2639# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2640
2641
2642/*
2643 * DEC
2644 */
2645
2646IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2647{
2648 uint64_t uDst = *puDst;
2649 uint64_t uResult = uDst - 1;
2650 *puDst = uResult;
2651 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2652}
2653
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655
2656IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2657{
2658 uint32_t uDst = *puDst;
2659 uint32_t uResult = uDst - 1;
2660 *puDst = uResult;
2661 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2662}
2663
2664
2665IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2666{
2667 uint16_t uDst = *puDst;
2668 uint16_t uResult = uDst - 1;
2669 *puDst = uResult;
2670 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2671}
2672
2673
2674IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2675{
2676 uint8_t uDst = *puDst;
2677 uint8_t uResult = uDst - 1;
2678 *puDst = uResult;
2679 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2680}
2681
2682# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2683
2684
2685/*
2686 * NOT
2687 */
2688
2689IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2690{
2691 uint64_t uDst = *puDst;
2692 uint64_t uResult = ~uDst;
2693 *puDst = uResult;
2694 /* EFLAGS are not modified. */
2695 RT_NOREF_PV(pfEFlags);
2696}
2697
2698# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2699
2700IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2701{
2702 uint32_t uDst = *puDst;
2703 uint32_t uResult = ~uDst;
2704 *puDst = uResult;
2705 /* EFLAGS are not modified. */
2706 RT_NOREF_PV(pfEFlags);
2707}
2708
2709IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2710{
2711 uint16_t uDst = *puDst;
2712 uint16_t uResult = ~uDst;
2713 *puDst = uResult;
2714 /* EFLAGS are not modified. */
2715 RT_NOREF_PV(pfEFlags);
2716}
2717
2718IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2719{
2720 uint8_t uDst = *puDst;
2721 uint8_t uResult = ~uDst;
2722 *puDst = uResult;
2723 /* EFLAGS are not modified. */
2724 RT_NOREF_PV(pfEFlags);
2725}
2726
2727# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2728
2729
2730/*
2731 * NEG
2732 */
2733
/**
 * Recomputes all six status bits (CF, PF, AF, ZF, SF and OF) for a NEG
 * instruction and writes the updated flags back through @a a_pfEFlags.
 *
 * This is a do/while(0) statement macro, so it produces no value.  The
 * arguments are expanded more than once and must be free of side effects.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (used for CF, AF and OF).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  It is token
 *                          pasted onto X86_EFL_GET_OF_, so pass a plain literal.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        /* Wipe every status bit (CF explicitly included) before rebuilding them. */ \
        uint32_t fEflTmp = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS & ~X86_EFL_CF); \
        /* CF is set whenever the original operand was non-zero. */ \
        fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        /* ZF and SF are derived from the result alone. */ \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* PF is looked up from the low byte of the result. */ \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        /* AF is the X86_EFL_AF bit of (result XOR original operand). */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        /* OF input: top bit set only when operand and result both have the sign bit set. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2755
2756IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2757{
2758 uint64_t uDst = *puDst;
2759 uint64_t uResult = (uint64_t)0 - uDst;
2760 *puDst = uResult;
2761 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2762}
2763
2764# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2765
2766IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2767{
2768 uint32_t uDst = *puDst;
2769 uint32_t uResult = (uint32_t)0 - uDst;
2770 *puDst = uResult;
2771 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2772}
2773
2774
2775IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2776{
2777 uint16_t uDst = *puDst;
2778 uint16_t uResult = (uint16_t)0 - uDst;
2779 *puDst = uResult;
2780 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2781}
2782
2783
2784IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2785{
2786 uint8_t uDst = *puDst;
2787 uint8_t uResult = (uint8_t)0 - uDst;
2788 *puDst = uResult;
2789 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2790}
2791
2792# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2793
2794/*
2795 * Locked variants.
2796 */
2797
2798/** Emit a function for doing a locked unary operand operation. */
2799# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2800 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2801 uint32_t *pfEFlags)) \
2802 { \
2803 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2804 uint ## a_cBitsWidth ## _t uTmp; \
2805 uint32_t fEflTmp; \
2806 do \
2807 { \
2808 uTmp = uOld; \
2809 fEflTmp = *pfEFlags; \
2810 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2811 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2812 *pfEFlags = fEflTmp; \
2813 }
2814
2815EMIT_LOCKED_UNARY_OP(inc, 64)
2816EMIT_LOCKED_UNARY_OP(dec, 64)
2817EMIT_LOCKED_UNARY_OP(not, 64)
2818EMIT_LOCKED_UNARY_OP(neg, 64)
2819# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2820EMIT_LOCKED_UNARY_OP(inc, 32)
2821EMIT_LOCKED_UNARY_OP(dec, 32)
2822EMIT_LOCKED_UNARY_OP(not, 32)
2823EMIT_LOCKED_UNARY_OP(neg, 32)
2824
2825EMIT_LOCKED_UNARY_OP(inc, 16)
2826EMIT_LOCKED_UNARY_OP(dec, 16)
2827EMIT_LOCKED_UNARY_OP(not, 16)
2828EMIT_LOCKED_UNARY_OP(neg, 16)
2829
2830EMIT_LOCKED_UNARY_OP(inc, 8)
2831EMIT_LOCKED_UNARY_OP(dec, 8)
2832EMIT_LOCKED_UNARY_OP(not, 8)
2833EMIT_LOCKED_UNARY_OP(neg, 8)
2834# endif
2835
2836#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2837
2838
2839/*********************************************************************************************************************************
2840* Shifting and Rotating *
2841*********************************************************************************************************************************/
2842
2843/*
2844 * ROL
2845 */
2846#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2847IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2848{ \
2849 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2850 if (cShift) \
2851 { \
2852 if (a_cBitsWidth < 32) \
2853 cShift &= a_cBitsWidth - 1; \
2854 a_uType const uDst = *puDst; \
2855 a_uType const uResult = a_fnHlp(uDst, cShift); \
2856 *puDst = uResult; \
2857 \
2858 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
2859 it the same way as for 1 bit shifts. */ \
2860 AssertCompile(X86_EFL_CF_BIT == 0); \
2861 uint32_t fEfl = *pfEFlags; \
2862 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2863 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2864 fEfl |= fCarry; \
2865 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2866 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2867 else /* Intel 10980XE: According to the first sub-shift: */ \
2868 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2869 *pfEFlags = fEfl; \
2870 } \
2871}
2872
2873#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2874EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2875#endif
2876EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2877EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2878
2879#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2880EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2881#endif
2882EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2883EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2884
2885DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2886{
2887 return (uValue << cShift) | (uValue >> (16 - cShift));
2888}
2889#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2890EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2891#endif
2892EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2893EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2894
2895DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2896{
2897 return (uValue << cShift) | (uValue >> (8 - cShift));
2898}
2899#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2900EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2901#endif
2902EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2903EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2904
2905
2906/*
2907 * ROR
2908 */
2909#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2910IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2911{ \
2912 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2913 if (cShift) \
2914 { \
2915 if (a_cBitsWidth < 32) \
2916 cShift &= a_cBitsWidth - 1; \
2917 a_uType const uDst = *puDst; \
2918 a_uType const uResult = a_fnHlp(uDst, cShift); \
2919 *puDst = uResult; \
2920 \
2921 /* Calc EFLAGS: */ \
2922 AssertCompile(X86_EFL_CF_BIT == 0); \
2923 uint32_t fEfl = *pfEFlags; \
2924 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2925 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2926 fEfl |= fCarry; \
2927 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2928 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2929 else /* Intel 10980XE: According to the first sub-shift: */ \
2930 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2931 *pfEFlags = fEfl; \
2932 } \
2933}
2934
2935#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2936EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2937#endif
2938EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2939EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2940
2941#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2942EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2943#endif
2944EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2945EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2946
2947DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2948{
2949 return (uValue >> cShift) | (uValue << (16 - cShift));
2950}
2951#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2952EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2953#endif
2954EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2955EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2956
2957DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2958{
2959 return (uValue >> cShift) | (uValue << (8 - cShift));
2960}
2961#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2962EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2963#endif
2964EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2965EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2966
2967
2968/*
2969 * RCL
2970 */
2971#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2972IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2973{ \
2974 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2975 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2976 cShift %= a_cBitsWidth + 1; \
2977 if (cShift) \
2978 { \
2979 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2980 cShift %= a_cBitsWidth + 1; \
2981 a_uType const uDst = *puDst; \
2982 a_uType uResult = uDst << cShift; \
2983 if (cShift > 1) \
2984 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2985 \
2986 AssertCompile(X86_EFL_CF_BIT == 0); \
2987 uint32_t fEfl = *pfEFlags; \
2988 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2989 uResult |= (a_uType)fInCarry << (cShift - 1); \
2990 \
2991 *puDst = uResult; \
2992 \
2993 /* Calc EFLAGS. */ \
2994 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2995 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2996 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2997 fEfl |= fOutCarry; \
2998 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2999 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3000 else /* Intel 10980XE: According to the first sub-shift: */ \
3001 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3002 *pfEFlags = fEfl; \
3003 } \
3004}
3005
3006#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3007EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3008#endif
3009EMIT_RCL(64, uint64_t, _intel, 1)
3010EMIT_RCL(64, uint64_t, _amd, 0)
3011
3012#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3013EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3014#endif
3015EMIT_RCL(32, uint32_t, _intel, 1)
3016EMIT_RCL(32, uint32_t, _amd, 0)
3017
3018#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3020#endif
3021EMIT_RCL(16, uint16_t, _intel, 1)
3022EMIT_RCL(16, uint16_t, _amd, 0)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3026#endif
3027EMIT_RCL(8, uint8_t, _intel, 1)
3028EMIT_RCL(8, uint8_t, _amd, 0)
3029
3030
3031/*
3032 * RCR
3033 */
3034#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3035IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3036{ \
3037 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3038 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3039 cShift %= a_cBitsWidth + 1; \
3040 if (cShift) \
3041 { \
3042 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3043 cShift %= a_cBitsWidth + 1; \
3044 a_uType const uDst = *puDst; \
3045 a_uType uResult = uDst >> cShift; \
3046 if (cShift > 1) \
3047 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3048 \
3049 AssertCompile(X86_EFL_CF_BIT == 0); \
3050 uint32_t fEfl = *pfEFlags; \
3051 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3052 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3053 *puDst = uResult; \
3054 \
3055 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3056 it the same way as for 1 bit shifts. */ \
3057 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3058 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3059 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3060 fEfl |= fOutCarry; \
3061 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3062 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3063 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3064 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3065 *pfEFlags = fEfl; \
3066 } \
3067}
3068
3069#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3070EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3071#endif
3072EMIT_RCR(64, uint64_t, _intel, 1)
3073EMIT_RCR(64, uint64_t, _amd, 0)
3074
3075#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3076EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3077#endif
3078EMIT_RCR(32, uint32_t, _intel, 1)
3079EMIT_RCR(32, uint32_t, _amd, 0)
3080
3081#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3082EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3083#endif
3084EMIT_RCR(16, uint16_t, _intel, 1)
3085EMIT_RCR(16, uint16_t, _amd, 0)
3086
3087#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3088EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3089#endif
3090EMIT_RCR(8, uint8_t, _intel, 1)
3091EMIT_RCR(8, uint8_t, _amd, 0)
3092
3093
3094/*
3095 * SHL
3096 */
3097#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3098IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3099{ \
3100 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3101 if (cShift) \
3102 { \
3103 a_uType const uDst = *puDst; \
3104 a_uType uResult = uDst << cShift; \
3105 *puDst = uResult; \
3106 \
3107 /* Calc EFLAGS. */ \
3108 AssertCompile(X86_EFL_CF_BIT == 0); \
3109 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3110 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3111 fEfl |= fCarry; \
3112 if (!a_fIntelFlags) \
3113 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3114 else \
3115 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3116 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3117 fEfl |= X86_EFL_CALC_ZF(uResult); \
3118 fEfl |= g_afParity[uResult & 0xff]; \
3119 if (!a_fIntelFlags) \
3120 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3121 *pfEFlags = fEfl; \
3122 } \
3123}
3124
3125#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3126EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3127#endif
3128EMIT_SHL(64, uint64_t, _intel, 1)
3129EMIT_SHL(64, uint64_t, _amd, 0)
3130
3131#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3132EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3133#endif
3134EMIT_SHL(32, uint32_t, _intel, 1)
3135EMIT_SHL(32, uint32_t, _amd, 0)
3136
3137#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3138EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3139#endif
3140EMIT_SHL(16, uint16_t, _intel, 1)
3141EMIT_SHL(16, uint16_t, _amd, 0)
3142
3143#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3144EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3145#endif
3146EMIT_SHL(8, uint8_t, _intel, 1)
3147EMIT_SHL(8, uint8_t, _amd, 0)
3148
3149
3150/*
3151 * SHR
3152 */
3153#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3154IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3155{ \
3156 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3157 if (cShift) \
3158 { \
3159 a_uType const uDst = *puDst; \
3160 a_uType uResult = uDst >> cShift; \
3161 *puDst = uResult; \
3162 \
3163 /* Calc EFLAGS. */ \
3164 AssertCompile(X86_EFL_CF_BIT == 0); \
3165 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3166 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3167 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3168 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3169 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3170 fEfl |= X86_EFL_CALC_ZF(uResult); \
3171 fEfl |= g_afParity[uResult & 0xff]; \
3172 if (!a_fIntelFlags) \
3173 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3174 *pfEFlags = fEfl; \
3175 } \
3176}
3177
3178#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3179EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3180#endif
3181EMIT_SHR(64, uint64_t, _intel, 1)
3182EMIT_SHR(64, uint64_t, _amd, 0)
3183
3184#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3185EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3186#endif
3187EMIT_SHR(32, uint32_t, _intel, 1)
3188EMIT_SHR(32, uint32_t, _amd, 0)
3189
3190#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3191EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3192#endif
3193EMIT_SHR(16, uint16_t, _intel, 1)
3194EMIT_SHR(16, uint16_t, _amd, 0)
3195
3196#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3197EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3198#endif
3199EMIT_SHR(8, uint8_t, _intel, 1)
3200EMIT_SHR(8, uint8_t, _amd, 0)
3201
3202
3203/*
3204 * SAR
3205 */
3206#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3207IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3208{ \
3209 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3210 if (cShift) \
3211 { \
3212 a_iType const iDst = (a_iType)*puDst; \
3213 a_uType uResult = iDst >> cShift; \
3214 *puDst = uResult; \
3215 \
3216 /* Calc EFLAGS. \
3217 Note! The OF flag is always zero because the result never differs from the input. */ \
3218 AssertCompile(X86_EFL_CF_BIT == 0); \
3219 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3220 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3221 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3222 fEfl |= X86_EFL_CALC_ZF(uResult); \
3223 fEfl |= g_afParity[uResult & 0xff]; \
3224 if (!a_fIntelFlags) \
3225 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3226 *pfEFlags = fEfl; \
3227 } \
3228}
3229
3230#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3231EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3232#endif
3233EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3234EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3235
3236#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3237EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3238#endif
3239EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3240EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3241
3242#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3243EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3244#endif
3245EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3246EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3247
3248#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3249EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3250#endif
3251EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3252EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3253
3254
3255/*
3256 * SHLD
3257 *
3258 * - CF is the last bit shifted out of puDst.
3259 * - AF is always cleared by Intel 10980XE.
3260 * - AF is always set by AMD 3990X.
3261 * - OF is set according to the first shift on Intel 10980XE, it seems.
3262 * - OF is set according to the last sub-shift on AMD 3990X.
3263 * - ZF, SF and PF are calculated according to the result by both vendors.
3264 *
3265 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3266 * pick either the source register or the destination register for input bits
3267 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3268 * intel has changed behaviour here several times. We implement what current
3269 * skylake based does for now, we can extend this later as needed.
3270 */
3271#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3272IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3273 uint32_t *pfEFlags)) \
3274{ \
3275 cShift &= a_cBitsWidth - 1; \
3276 if (cShift) \
3277 { \
3278 a_uType const uDst = *puDst; \
3279 a_uType uResult = uDst << cShift; \
3280 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3281 *puDst = uResult; \
3282 \
3283 /* CALC EFLAGS: */ \
3284 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3285 if (a_fIntelFlags) \
3286 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3287 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3288 else \
3289 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3290 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3291 fEfl |= X86_EFL_AF; \
3292 } \
3293 AssertCompile(X86_EFL_CF_BIT == 0); \
3294 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3295 fEfl |= g_afParity[uResult & 0xff]; \
3296 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3297 fEfl |= X86_EFL_CALC_ZF(uResult); \
3298 *pfEFlags = fEfl; \
3299 } \
3300}
3301
3302#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3303EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3304#endif
3305EMIT_SHLD(64, uint64_t, _intel, 1)
3306EMIT_SHLD(64, uint64_t, _amd, 0)
3307
3308#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3309EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3310#endif
3311EMIT_SHLD(32, uint32_t, _intel, 1)
3312EMIT_SHLD(32, uint32_t, _amd, 0)
3313
3314#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3315IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3316{ \
3317 cShift &= 31; \
3318 if (cShift) \
3319 { \
3320 uint16_t const uDst = *puDst; \
3321 uint64_t const uTmp = a_fIntelFlags \
3322 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3323 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3324 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3325 *puDst = uResult; \
3326 \
3327 /* CALC EFLAGS: */ \
3328 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3329 AssertCompile(X86_EFL_CF_BIT == 0); \
3330 if (a_fIntelFlags) \
3331 { \
3332 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3333 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3334 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3335 } \
3336 else \
3337 { \
3338 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3339 if (cShift < 16) \
3340 { \
3341 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3342 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3343 } \
3344 else \
3345 { \
3346 if (cShift == 16) \
3347 fEfl |= uDst & X86_EFL_CF; \
3348 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3349 } \
3350 fEfl |= X86_EFL_AF; \
3351 } \
3352 fEfl |= g_afParity[uResult & 0xff]; \
3353 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3354 fEfl |= X86_EFL_CALC_ZF(uResult); \
3355 *pfEFlags = fEfl; \
3356 } \
3357}
3358
3359#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3360EMIT_SHLD_16(RT_NOTHING, 1)
3361#endif
3362EMIT_SHLD_16(_intel, 1)
3363EMIT_SHLD_16(_amd, 0)
3364
3365
3366/*
3367 * SHRD
3368 *
3369 * EFLAGS behaviour seems to be the same as with SHLD:
3370 * - CF is the last bit shifted out of puDst.
3371 * - AF is always cleared by Intel 10980XE.
3372 * - AF is always set by AMD 3990X.
3373 * - OF is set according to the first shift on Intel 10980XE, it seems.
3374 * - OF is set according to the last sub-shift on AMD 3990X.
3375 * - ZF, SF and PF are calculated according to the result by both vendors.
3376 *
3377 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3378 * pick either the source register or the destination register for input bits
3379 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3380 * intel has changed behaviour here several times. We implement what current
3381 * skylake based does for now, we can extend this later as needed.
3382 */
3383#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3384IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3385{ \
3386 cShift &= a_cBitsWidth - 1; \
3387 if (cShift) \
3388 { \
3389 a_uType const uDst = *puDst; \
3390 a_uType uResult = uDst >> cShift; \
3391 uResult |= uSrc << (a_cBitsWidth - cShift); \
3392 *puDst = uResult; \
3393 \
3394 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3395 AssertCompile(X86_EFL_CF_BIT == 0); \
3396 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3397 if (a_fIntelFlags) \
3398 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3399 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3400 else \
3401 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3402 if (cShift > 1) /* Set according to last shift. */ \
3403 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3404 else \
3405 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3406 fEfl |= X86_EFL_AF; \
3407 } \
3408 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3409 fEfl |= X86_EFL_CALC_ZF(uResult); \
3410 fEfl |= g_afParity[uResult & 0xff]; \
3411 *pfEFlags = fEfl; \
3412 } \
3413}
3414
3415#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3416EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3417#endif
3418EMIT_SHRD(64, uint64_t, _intel, 1)
3419EMIT_SHRD(64, uint64_t, _amd, 0)
3420
3421#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3422EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3423#endif
3424EMIT_SHRD(32, uint32_t, _intel, 1)
3425EMIT_SHRD(32, uint32_t, _amd, 0)
3426
3427#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3428IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3429{ \
3430 cShift &= 31; \
3431 if (cShift) \
3432 { \
3433 uint16_t const uDst = *puDst; \
3434 uint64_t const uTmp = a_fIntelFlags \
3435 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3436 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3437 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3438 *puDst = uResult; \
3439 \
3440 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3441 AssertCompile(X86_EFL_CF_BIT == 0); \
3442 if (a_fIntelFlags) \
3443 { \
3444 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3445 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3446 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3447 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3448 } \
3449 else \
3450 { \
3451 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3452 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3453 /* AMD 3990X: Set according to last shift. AF always set. */ \
3454 if (cShift > 1) /* Set according to last shift. */ \
3455 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3456 else \
3457 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3458 fEfl |= X86_EFL_AF; \
3459 } \
3460 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3461 fEfl |= X86_EFL_CALC_ZF(uResult); \
3462 fEfl |= g_afParity[uResult & 0xff]; \
3463 *pfEFlags = fEfl; \
3464 } \
3465}
3466
3467#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3468EMIT_SHRD_16(RT_NOTHING, 1)
3469#endif
3470EMIT_SHRD_16(_intel, 1)
3471EMIT_SHRD_16(_amd, 0)
3472
3473
3474/*
3475 * RORX (BMI2)
3476 */
3477#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3478IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3479{ \
3480 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3481}
3482
3483#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3484EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3485#endif
3486#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3487EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3488#endif
3489
3490
3491/*
3492 * SHLX (BMI2)
3493 */
3494#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3495IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3496{ \
3497 cShift &= a_cBitsWidth - 1; \
3498 *puDst = uSrc << cShift; \
3499}
3500
3501#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3502EMIT_SHLX(64, uint64_t, RT_NOTHING)
3503EMIT_SHLX(64, uint64_t, _fallback)
3504#endif
3505#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3506EMIT_SHLX(32, uint32_t, RT_NOTHING)
3507EMIT_SHLX(32, uint32_t, _fallback)
3508#endif
3509
3510
3511/*
3512 * SHRX (BMI2)
3513 */
3514#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3515IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3516{ \
3517 cShift &= a_cBitsWidth - 1; \
3518 *puDst = uSrc >> cShift; \
3519}
3520
3521#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3522EMIT_SHRX(64, uint64_t, RT_NOTHING)
3523EMIT_SHRX(64, uint64_t, _fallback)
3524#endif
3525#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3526EMIT_SHRX(32, uint32_t, RT_NOTHING)
3527EMIT_SHRX(32, uint32_t, _fallback)
3528#endif
3529
3530
3531/*
3532 * SARX (BMI2)
3533 */
3534#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3535IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3536{ \
3537 cShift &= a_cBitsWidth - 1; \
3538 *puDst = (a_iType)uSrc >> cShift; \
3539}
3540
3541#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3542EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3543EMIT_SARX(64, uint64_t, int64_t, _fallback)
3544#endif
3545#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3546EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3547EMIT_SARX(32, uint32_t, int32_t, _fallback)
3548#endif
3549
3550
3551/*
3552 * PDEP (BMI2)
3553 */
3554#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PDEP(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PDEP(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PDEP(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PDEP(32, uint32_t, _fallback)
3575
3576/*
3577 * PEXT (BMI2)
3578 */
3579#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3580IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3581{ \
3582 a_uType uResult = 0; \
3583 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3584 if (fMask & ((a_uType)1 << iMaskBit)) \
3585 { \
3586 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3587 iBit++; \
3588 } \
3589 *puDst = uResult; \
3590}
3591
3592#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3593EMIT_PEXT(64, uint64_t, RT_NOTHING)
3594#endif
3595EMIT_PEXT(64, uint64_t, _fallback)
3596#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3597EMIT_PEXT(32, uint32_t, RT_NOTHING)
3598#endif
3599EMIT_PEXT(32, uint32_t, _fallback)
3600
3601
3602#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3603
3604# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3605/*
3606 * BSWAP
3607 */
3608
3609IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3610{
3611 *puDst = ASMByteSwapU64(*puDst);
3612}
3613
3614
3615IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3616{
3617 *puDst = ASMByteSwapU32(*puDst);
3618}
3619
3620
3621/* Note! undocument, so 32-bit arg */
3622IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3623{
3624#if 0
3625 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3626#else
3627 /* This is the behaviour AMD 3990x (64-bit mode): */
3628 *(uint16_t *)puDst = 0;
3629#endif
3630}
3631
3632# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3633
3634
3635
3636# if defined(IEM_WITHOUT_ASSEMBLY)
3637
3638/*
3639 * LFENCE, SFENCE & MFENCE.
3640 */
3641
/** LFENCE worker for builds with IEM_WITHOUT_ASSEMBLY: issues ASMReadFence(). */
IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
{
    ASMReadFence();
}
3646
3647
/** SFENCE worker for builds with IEM_WITHOUT_ASSEMBLY: issues ASMWriteFence(). */
IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
{
    ASMWriteFence();
}
3652
3653
/** MFENCE worker for builds with IEM_WITHOUT_ASSEMBLY: issues ASMMemoryFence(). */
IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
{
    ASMMemoryFence();
}
3658
3659
/* The alternative memory fence worker is not emitted when RT_ARCH_ARM64 is defined. */
# ifndef RT_ARCH_ARM64
/** Alternative memory fence worker; issues ASMMemoryFence() just like iemAImpl_mfence. */
IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
{
    ASMMemoryFence();
}
# endif
3666
3667# endif
3668
3669#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3670
3671
3672IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3673{
3674 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3675 {
3676 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3677 *pu16Dst |= u16Src & X86_SEL_RPL;
3678
3679 *pfEFlags |= X86_EFL_ZF;
3680 }
3681 else
3682 *pfEFlags &= ~X86_EFL_ZF;
3683}
3684
3685
3686#if defined(IEM_WITHOUT_ASSEMBLY)
3687
3688/*********************************************************************************************************************************
3689* x87 FPU Loads *
3690*********************************************************************************************************************************/
3691
3692IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3693{
3694 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3695 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3696 {
3697 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3698 pFpuRes->r80Result.sj64.fInteger = 1;
3699 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3700 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3701 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3702 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3703 }
3704 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3705 {
3706 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3707 pFpuRes->r80Result.s.uExponent = 0;
3708 pFpuRes->r80Result.s.uMantissa = 0;
3709 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3710 }
3711 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3712 {
3713 /* Subnormal values gets normalized. */
3714 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3715 pFpuRes->r80Result.sj64.fInteger = 1;
3716 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3717 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3718 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3719 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3720 pFpuRes->FSW |= X86_FSW_DE;
3721 if (!(pFpuState->FCW & X86_FCW_DM))
3722 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3723 }
3724 else if (RTFLOAT32U_IS_INF(pr32Val))
3725 {
3726 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3727 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3728 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3729 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3730 }
3731 else
3732 {
3733 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3734 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3735 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3736 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3737 pFpuRes->r80Result.sj64.fInteger = 1;
3738 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3739 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3740 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3741 {
3742 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3743 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3744 pFpuRes->FSW |= X86_FSW_IE;
3745
3746 if (!(pFpuState->FCW & X86_FCW_IM))
3747 {
3748 /* The value is not pushed. */
3749 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3750 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3751 pFpuRes->r80Result.au64[0] = 0;
3752 pFpuRes->r80Result.au16[4] = 0;
3753 }
3754 }
3755 else
3756 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3757 }
3758}
3759
3760
3761IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3762{
3763 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3764 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3765 {
3766 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3767 pFpuRes->r80Result.sj64.fInteger = 1;
3768 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3769 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3770 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3771 }
3772 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3773 {
3774 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3775 pFpuRes->r80Result.s.uExponent = 0;
3776 pFpuRes->r80Result.s.uMantissa = 0;
3777 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3778 }
3779 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3780 {
3781 /* Subnormal values gets normalized. */
3782 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3783 pFpuRes->r80Result.sj64.fInteger = 1;
3784 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3785 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3786 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3787 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3788 pFpuRes->FSW |= X86_FSW_DE;
3789 if (!(pFpuState->FCW & X86_FCW_DM))
3790 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3791 }
3792 else if (RTFLOAT64U_IS_INF(pr64Val))
3793 {
3794 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3795 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3796 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3797 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3798 }
3799 else
3800 {
3801 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3802 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3803 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3804 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3805 pFpuRes->r80Result.sj64.fInteger = 1;
3806 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3807 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3808 {
3809 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3810 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3811 pFpuRes->FSW |= X86_FSW_IE;
3812
3813 if (!(pFpuState->FCW & X86_FCW_IM))
3814 {
3815 /* The value is not pushed. */
3816 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3817 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3818 pFpuRes->r80Result.au64[0] = 0;
3819 pFpuRes->r80Result.au16[4] = 0;
3820 }
3821 }
3822 else
3823 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3824 }
3825}
3826
3827
3828IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3829{
3830 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3831 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3832 /* Raises no exceptions. */
3833 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3834}
3835
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3838{
3839 pFpuRes->r80Result.sj64.fSign = 0;
3840 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3841 pFpuRes->r80Result.sj64.fInteger = 1;
3842 pFpuRes->r80Result.sj64.uFraction = 0;
3843
3844 /*
3845 * FPU status word:
3846 * - TOP is irrelevant, but we must match x86 assembly version.
3847 * - C1 is always cleared as we don't have any stack overflows.
3848 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3849 */
3850 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3851}
3852
3853
3854IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3855{
3856 pFpuRes->r80Result.sj64.fSign = 0;
3857 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3858 pFpuRes->r80Result.sj64.fInteger = 1;
3859 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3860 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3861 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3862 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3863}
3864
3865
3866IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3867{
3868 pFpuRes->r80Result.sj64.fSign = 0;
3869 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3870 pFpuRes->r80Result.sj64.fInteger = 1;
3871 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3872 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3873 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3874}
3875
3876
3877IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3878{
3879 pFpuRes->r80Result.sj64.fSign = 0;
3880 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3881 pFpuRes->r80Result.sj64.fInteger = 1;
3882 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3883 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3884 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3885 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3886}
3887
3888
3889IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3890{
3891 pFpuRes->r80Result.sj64.fSign = 0;
3892 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3893 pFpuRes->r80Result.sj64.fInteger = 1;
3894 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3895 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3896 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3897 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3898}
3899
3900
3901IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3902{
3903 pFpuRes->r80Result.sj64.fSign = 0;
3904 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3905 pFpuRes->r80Result.sj64.fInteger = 1;
3906 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3907 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3908 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3909 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3910}
3911
3912
3913IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3914{
3915 pFpuRes->r80Result.s.fSign = 0;
3916 pFpuRes->r80Result.s.uExponent = 0;
3917 pFpuRes->r80Result.s.uMantissa = 0;
3918 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3919}
3920
3921#define EMIT_FILD(a_cBits) \
3922IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3923 int ## a_cBits ## _t const *piVal)) \
3924{ \
3925 int ## a_cBits ## _t iVal = *piVal; \
3926 if (iVal == 0) \
3927 { \
3928 pFpuRes->r80Result.s.fSign = 0; \
3929 pFpuRes->r80Result.s.uExponent = 0; \
3930 pFpuRes->r80Result.s.uMantissa = 0; \
3931 } \
3932 else \
3933 { \
3934 if (iVal > 0) \
3935 pFpuRes->r80Result.s.fSign = 0; \
3936 else \
3937 { \
3938 pFpuRes->r80Result.s.fSign = 1; \
3939 iVal = -iVal; \
3940 } \
3941 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3942 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3943 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3944 } \
3945 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3946}
3947EMIT_FILD(16)
3948EMIT_FILD(32)
3949EMIT_FILD(64)
3950
3951
3952IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3953{
3954 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3955 if ( pd80Val->s.abPairs[0] == 0
3956 && pd80Val->s.abPairs[1] == 0
3957 && pd80Val->s.abPairs[2] == 0
3958 && pd80Val->s.abPairs[3] == 0
3959 && pd80Val->s.abPairs[4] == 0
3960 && pd80Val->s.abPairs[5] == 0
3961 && pd80Val->s.abPairs[6] == 0
3962 && pd80Val->s.abPairs[7] == 0
3963 && pd80Val->s.abPairs[8] == 0)
3964 {
3965 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3966 pFpuRes->r80Result.s.uExponent = 0;
3967 pFpuRes->r80Result.s.uMantissa = 0;
3968 }
3969 else
3970 {
3971 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3972
3973 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3974 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3975 cPairs--;
3976
3977 uint64_t uVal = 0;
3978 uint64_t uFactor = 1;
3979 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3980 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3981 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3982
3983 unsigned const cBits = ASMBitLastSetU64(uVal);
3984 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3985 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3986 }
3987}
3988
3989
3990/*********************************************************************************************************************************
3991* x87 FPU Stores *
3992*********************************************************************************************************************************/
3993
/**
 * Helper for storing a deconstructed and normal R80 value as a 32-bit one.
 *
 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 * When an unmasked underflow or overflow is detected the function returns
 * early without writing to @a pr32Dst.
 *
 * @returns Updated FPU status word value.
 * @param   fSignIn         Incoming sign indicator.
 * @param   uMantissaIn     Incoming mantissa (dot between bit 63 and 62).
 * @param   iExponentIn     Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            Prepped FPU status word, i.e. exceptions and C1 clear.
 * @param   pr32Dst         Where to return the output value, if one should be
 *                          returned.
 *
 * @note    Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
 * @note    Exact same logic as iemAImpl_StoreNormalR80AsR64.
 */
static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
                                             uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
{
    /* Mask of the mantissa bits that do not fit in the 32-bit fraction (the
       low 40 bits, i.e. 0xffffffffff, given 63 and 23 fraction bits). */
    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1;
    /* The value to add before truncating: half of the dropped range for
       nearest, the whole dropped range when rounding away from zero for this
       sign (down for negative, up for positive), zero otherwise. */
    uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                    ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1)
                                    : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                    ? fRoundingOffMask
                                    : 0;
    uint64_t       fRoundedOff      = uMantissaIn & fRoundingOffMask;

    /*
     * Deal with potential overflows/underflows first, optimizing for none.
     * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
     */
    int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
    if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
    { /* likely? */ }
    /*
     * Underflow if the exponent is zero or negative.  We attempt to map this
     * to a subnormal number when possible, with some additional trickery ofc.
     */
    else if (iExponentOut <= 0)
    {
        bool const fIsTiny = iExponentOut < 0
                          || UINT64_MAX - uMantissaIn > uRoundingAdd;
        if (!(fFcw & X86_FCW_UM) && fIsTiny)
            /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
            return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;

        if (iExponentOut <= 0)
        {
            /* Shift the mantissa down into subnormal position, folding all the
               bits shifted out into a sticky bit 0. */
            uMantissaIn = iExponentOut <= -63
                        ? uMantissaIn != 0
                        : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
            fRoundedOff = uMantissaIn & fRoundingOffMask;
            if (fRoundedOff && fIsTiny)
                fFsw |= X86_FSW_UE;
            iExponentOut = 0;
        }
    }
    /*
     * Overflow if at or above max exponent value or if we will reach max
     * when rounding.  Will return +/-infinity or +/-max value depending on
     * whether we're rounding or not.
     */
    else if (   iExponentOut >= RTFLOAT32U_EXP_MAX
             || (   iExponentOut == RTFLOAT32U_EXP_MAX - 1
                 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
    {
        fFsw |= X86_FSW_OE;
        if (!(fFcw & X86_FCW_OM))
            return fFsw | X86_FSW_ES | X86_FSW_B;
        fFsw |= X86_FSW_PE;
        if (uRoundingAdd)
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;

        pr32Dst->s.fSign = fSignIn;
        if (uRoundingAdd)
        {   /* Infinity (max exponent, zero fraction). */
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = 0;
        }
        else
        {   /* Max (largest finite value). */
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
            pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
        }
        return fFsw;
    }

    /*
     * Normal or subnormal number.
     */
    /* Do rounding - just truncate in near mode when midway on an even outcome. */
    uint64_t uMantissaOut = uMantissaIn;
    if (   (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
        || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
        || fRoundedOff != uRoundingAdd)
    {
        uMantissaOut = uMantissaIn + uRoundingAdd;
        if (uMantissaOut >= uMantissaIn)
        { /* likely */ }
        else
        {
            /* The addition wrapped around, i.e. it carried out of bit 63. */
            uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
            iExponentOut++;
            Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
            fFsw |= X86_FSW_C1;
        }
    }
    else
        uMantissaOut = uMantissaIn;

    /* Truncate the mantissa and set the return value. */
    uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;

    pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
    pr32Dst->s.uExponent = iExponentOut;
    pr32Dst->s.fSign     = fSignIn;

    /* Set status flags related to rounding. */
    if (fRoundedOff)
    {
        fFsw |= X86_FSW_PE;
        /* C1 indicates that the stored value was rounded up (in magnitude). */
        if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    return fFsw;
}
4126
4127
4128/**
4129 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4130 */
4131IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4132 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4133{
4134 uint16_t const fFcw = pFpuState->FCW;
4135 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4136 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4137 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4138 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4139 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4140 {
4141 pr32Dst->s.fSign = pr80Src->s.fSign;
4142 pr32Dst->s.uExponent = 0;
4143 pr32Dst->s.uFraction = 0;
4144 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4145 }
4146 else if (RTFLOAT80U_IS_INF(pr80Src))
4147 {
4148 pr32Dst->s.fSign = pr80Src->s.fSign;
4149 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4150 pr32Dst->s.uFraction = 0;
4151 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4152 }
4153 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4154 {
4155 /* Mapped to +/-QNaN */
4156 pr32Dst->s.fSign = pr80Src->s.fSign;
4157 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4158 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4159 }
4160 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4161 {
4162 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4163 if (fFcw & X86_FCW_IM)
4164 {
4165 pr32Dst->s.fSign = 1;
4166 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4167 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4168 fFsw |= X86_FSW_IE;
4169 }
4170 else
4171 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4172 }
4173 else if (RTFLOAT80U_IS_NAN(pr80Src))
4174 {
4175 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4176 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4177 {
4178 pr32Dst->s.fSign = pr80Src->s.fSign;
4179 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4180 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4181 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4182 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4183 fFsw |= X86_FSW_IE;
4184 }
4185 else
4186 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4187 }
4188 else
4189 {
4190 /* Denormal values causes both an underflow and precision exception. */
4191 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4192 if (fFcw & X86_FCW_UM)
4193 {
4194 pr32Dst->s.fSign = pr80Src->s.fSign;
4195 pr32Dst->s.uExponent = 0;
4196 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4197 {
4198 pr32Dst->s.uFraction = 1;
4199 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4200 if (!(fFcw & X86_FCW_PM))
4201 fFsw |= X86_FSW_ES | X86_FSW_B;
4202 }
4203 else
4204 {
4205 pr32Dst->s.uFraction = 0;
4206 fFsw |= X86_FSW_UE | X86_FSW_PE;
4207 if (!(fFcw & X86_FCW_PM))
4208 fFsw |= X86_FSW_ES | X86_FSW_B;
4209 }
4210 }
4211 else
4212 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4213 }
4214 *pu16FSW = fFsw;
4215}
4216
4217
/**
 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
 *
 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 * When an unmasked underflow or overflow is detected the function returns
 * early without writing to @a pr64Dst.
 *
 * @returns Updated FPU status word value.
 * @param   fSignIn         Incoming sign indicator.
 * @param   uMantissaIn     Incoming mantissa (dot between bit 63 and 62).
 * @param   iExponentIn     Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            Prepped FPU status word, i.e. exceptions and C1 clear.
 * @param   pr64Dst         Where to return the output value, if one should be
 *                          returned.
 *
 * @note    Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
 * @note    Exact same logic as iemAImpl_StoreNormalR80AsR32.
 */
static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
                                             uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
{
    /* Mask of the mantissa bits that do not fit in the 64-bit fraction. */
    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
    /* The value to add before truncating: half of the dropped range for
       nearest, the whole dropped range when rounding away from zero for this
       sign (down for negative, up for positive), zero otherwise. */
    uint32_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                    ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
                                    : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                    ? fRoundingOffMask
                                    : 0;
    uint32_t       fRoundedOff      = uMantissaIn & fRoundingOffMask;

    /*
     * Deal with potential overflows/underflows first, optimizing for none.
     * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
     */
    int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
    if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
    { /* likely? */ }
    /*
     * Underflow if the exponent is zero or negative.  We attempt to map this
     * to a subnormal number when possible, with some additional trickery ofc.
     */
    else if (iExponentOut <= 0)
    {
        bool const fIsTiny = iExponentOut < 0
                          || UINT64_MAX - uMantissaIn > uRoundingAdd;
        if (!(fFcw & X86_FCW_UM) && fIsTiny)
            /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
            return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;

        if (iExponentOut <= 0)
        {
            /* Shift the mantissa down into subnormal position, folding all the
               bits shifted out into a sticky bit 0. */
            uMantissaIn = iExponentOut <= -63
                        ? uMantissaIn != 0
                        : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
            fRoundedOff = uMantissaIn & fRoundingOffMask;
            if (fRoundedOff && fIsTiny)
                fFsw |= X86_FSW_UE;
            iExponentOut = 0;
        }
    }
    /*
     * Overflow if at or above max exponent value or if we will reach max
     * when rounding.  Will return +/-infinity or +/-max value depending on
     * whether we're rounding or not.
     */
    else if (   iExponentOut >= RTFLOAT64U_EXP_MAX
             || (   iExponentOut == RTFLOAT64U_EXP_MAX - 1
                 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
    {
        fFsw |= X86_FSW_OE;
        if (!(fFcw & X86_FCW_OM))
            return fFsw | X86_FSW_ES | X86_FSW_B;
        fFsw |= X86_FSW_PE;
        if (uRoundingAdd)
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;

        pr64Dst->s64.fSign = fSignIn;
        if (uRoundingAdd)
        {   /* Infinity (max exponent, zero fraction). */
            pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
            pr64Dst->s64.uFraction = 0;
        }
        else
        {   /* Max (largest finite value). */
            pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
            pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
        }
        return fFsw;
    }

    /*
     * Normal or subnormal number.
     */
    /* Do rounding - just truncate in near mode when midway on an even outcome. */
    uint64_t uMantissaOut = uMantissaIn;
    if (   (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
        || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
        || fRoundedOff != uRoundingAdd)
    {
        uMantissaOut = uMantissaIn + uRoundingAdd;
        if (uMantissaOut >= uMantissaIn)
        { /* likely */ }
        else
        {
            /* The addition wrapped around, i.e. it carried out of bit 63. */
            uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
            iExponentOut++;
            Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
            fFsw |= X86_FSW_C1;
        }
    }
    else
        uMantissaOut = uMantissaIn;

    /* Truncate the mantissa and set the return value. */
    uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;

    pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
    pr64Dst->s64.uExponent = iExponentOut;
    pr64Dst->s64.fSign     = fSignIn;

    /* Set status flags related to rounding. */
    if (fRoundedOff)
    {
        fFsw |= X86_FSW_PE;
        /* C1 indicates that the stored value was rounded up (in magnitude). */
        if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    return fFsw;
}
4350
4351
4352/**
4353 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4354 */
4355IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4356 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4357{
4358 uint16_t const fFcw = pFpuState->FCW;
4359 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4360 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4361 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4362 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4363 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4364 {
4365 pr64Dst->s64.fSign = pr80Src->s.fSign;
4366 pr64Dst->s64.uExponent = 0;
4367 pr64Dst->s64.uFraction = 0;
4368 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4369 }
4370 else if (RTFLOAT80U_IS_INF(pr80Src))
4371 {
4372 pr64Dst->s64.fSign = pr80Src->s.fSign;
4373 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4374 pr64Dst->s64.uFraction = 0;
4375 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4376 }
4377 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4378 {
4379 /* Mapped to +/-QNaN */
4380 pr64Dst->s64.fSign = pr80Src->s.fSign;
4381 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4382 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4383 }
4384 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4385 {
4386 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4387 if (fFcw & X86_FCW_IM)
4388 {
4389 pr64Dst->s64.fSign = 1;
4390 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4391 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4392 fFsw |= X86_FSW_IE;
4393 }
4394 else
4395 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4396 }
4397 else if (RTFLOAT80U_IS_NAN(pr80Src))
4398 {
4399 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4400 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4401 {
4402 pr64Dst->s64.fSign = pr80Src->s.fSign;
4403 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4404 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4405 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4406 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4407 fFsw |= X86_FSW_IE;
4408 }
4409 else
4410 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4411 }
4412 else
4413 {
4414 /* Denormal values causes both an underflow and precision exception. */
4415 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4416 if (fFcw & X86_FCW_UM)
4417 {
4418 pr64Dst->s64.fSign = pr80Src->s.fSign;
4419 pr64Dst->s64.uExponent = 0;
4420 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4421 {
4422 pr64Dst->s64.uFraction = 1;
4423 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4424 if (!(fFcw & X86_FCW_PM))
4425 fFsw |= X86_FSW_ES | X86_FSW_B;
4426 }
4427 else
4428 {
4429 pr64Dst->s64.uFraction = 0;
4430 fFsw |= X86_FSW_UE | X86_FSW_PE;
4431 if (!(fFcw & X86_FCW_PM))
4432 fFsw |= X86_FSW_ES | X86_FSW_B;
4433 }
4434 }
4435 else
4436 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4437 }
4438 *pu16FSW = fFsw;
4439}
4440
4441
4442IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4443 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4444{
4445 /*
4446 * FPU status word:
4447 * - TOP is irrelevant, but we must match x86 assembly version (0).
4448 * - C1 is always cleared as we don't have any stack overflows.
4449 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4450 */
4451 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4452 *pr80Dst = *pr80Src;
4453}
4454
4455
4456/*
4457 *
4458 * Mantissa:
4459 * 63 56 48 40 32 24 16 8 0
4460 * v v v v v v v v v
4461 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4462 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4463 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4464 *
4465 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4466 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4467 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4468 * where we'll drop off all but bit 63.
4469 */
4470#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4471IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4472 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4473{ \
4474 uint16_t const fFcw = pFpuState->FCW; \
4475 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4476 bool const fSignIn = pr80Val->s.fSign; \
4477 \
4478 /* \
4479 * Deal with normal numbers first. \
4480 */ \
4481 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4482 { \
4483 uint64_t uMantissa = pr80Val->s.uMantissa; \
4484 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4485 \
4486 if ((uint32_t)iExponent <= a_cBits - 2) \
4487 { \
4488 unsigned const cShiftOff = 63 - iExponent; \
4489 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4490 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4491 ? RT_BIT_64(cShiftOff - 1) \
4492 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4493 ? fRoundingOffMask \
4494 : 0; \
4495 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4496 \
4497 uMantissa >>= cShiftOff; \
4498 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4499 uMantissa += uRounding; \
4500 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4501 { \
4502 if (fRoundedOff) \
4503 { \
4504 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4505 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4506 else if (uRounding) \
4507 fFsw |= X86_FSW_C1; \
4508 fFsw |= X86_FSW_PE; \
4509 if (!(fFcw & X86_FCW_PM)) \
4510 fFsw |= X86_FSW_ES | X86_FSW_B; \
4511 } \
4512 \
4513 if (!fSignIn) \
4514 *piDst = (a_iType)uMantissa; \
4515 else \
4516 *piDst = -(a_iType)uMantissa; \
4517 } \
4518 else \
4519 { \
4520 /* overflowed after rounding. */ \
4521 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4522 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4523 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4524 \
4525 /* Special case for the integer minimum value. */ \
4526 if (fSignIn) \
4527 { \
4528 *piDst = a_iTypeMin; \
4529 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4530 if (!(fFcw & X86_FCW_PM)) \
4531 fFsw |= X86_FSW_ES | X86_FSW_B; \
4532 } \
4533 else \
4534 { \
4535 fFsw |= X86_FSW_IE; \
4536 if (fFcw & X86_FCW_IM) \
4537 *piDst = a_iTypeMin; \
4538 else \
4539 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4540 } \
4541 } \
4542 } \
4543 /* \
4544 * Tiny sub-zero numbers. \
4545 */ \
4546 else if (iExponent < 0) \
4547 { \
4548 if (!fSignIn) \
4549 { \
4550 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4551 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4552 { \
4553 *piDst = 1; \
4554 fFsw |= X86_FSW_C1; \
4555 } \
4556 else \
4557 *piDst = 0; \
4558 } \
4559 else \
4560 { \
4561 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4562 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4563 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4564 *piDst = 0; \
4565 else \
4566 { \
4567 *piDst = -1; \
4568 fFsw |= X86_FSW_C1; \
4569 } \
4570 } \
4571 fFsw |= X86_FSW_PE; \
4572 if (!(fFcw & X86_FCW_PM)) \
4573 fFsw |= X86_FSW_ES | X86_FSW_B; \
4574 } \
4575 /* \
4576 * Special MIN case. \
4577 */ \
4578 else if ( fSignIn && iExponent == a_cBits - 1 \
4579 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4580 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4581 : uMantissa == RT_BIT_64(63))) \
4582 { \
4583 *piDst = a_iTypeMin; \
4584 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4585 { \
4586 fFsw |= X86_FSW_PE; \
4587 if (!(fFcw & X86_FCW_PM)) \
4588 fFsw |= X86_FSW_ES | X86_FSW_B; \
4589 } \
4590 } \
4591 /* \
4592 * Too large/small number outside the target integer range. \
4593 */ \
4594 else \
4595 { \
4596 fFsw |= X86_FSW_IE; \
4597 if (fFcw & X86_FCW_IM) \
4598 *piDst = a_iTypeIndefinite; \
4599 else \
4600 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4601 } \
4602 } \
4603 /* \
4604 * Map both +0 and -0 to integer zero (signless/+). \
4605 */ \
4606 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4607 *piDst = 0; \
4608 /* \
4609 * Denormals are just really tiny sub-zero numbers that are either rounded \
4610 * to zero, 1 or -1 depending on sign and rounding control. \
4611 */ \
4612 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4613 { \
4614 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4615 *piDst = 0; \
4616 else \
4617 { \
4618 *piDst = fSignIn ? -1 : 1; \
4619 fFsw |= X86_FSW_C1; \
4620 } \
4621 fFsw |= X86_FSW_PE; \
4622 if (!(fFcw & X86_FCW_PM)) \
4623 fFsw |= X86_FSW_ES | X86_FSW_B; \
4624 } \
4625 /* \
4626 * All other special values are considered invalid arguments and result \
4627 * in an IE exception and indefinite value if masked. \
4628 */ \
4629 else \
4630 { \
4631 fFsw |= X86_FSW_IE; \
4632 if (fFcw & X86_FCW_IM) \
4633 *piDst = a_iTypeIndefinite; \
4634 else \
4635 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4636 } \
4637 *pu16FSW = fFsw; \
4638}
4639EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4640EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4641EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4642
4643#endif /*IEM_WITHOUT_ASSEMBLY */
4644
4645
4646/*
4647 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4648 *
4649 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4650 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4651 * thus the @a a_cBitsIn.
4652 */
4653#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4654IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4655 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4656{ \
4657 uint16_t const fFcw = pFpuState->FCW; \
4658 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4659 bool const fSignIn = pr80Val->s.fSign; \
4660 \
4661 /* \
4662 * Deal with normal numbers first. \
4663 */ \
4664 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4665 { \
4666 uint64_t uMantissa = pr80Val->s.uMantissa; \
4667 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4668 \
4669 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4670 { \
4671 unsigned const cShiftOff = 63 - iExponent; \
4672 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4673 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4674 uMantissa >>= cShiftOff; \
4675 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4676 if (!fSignIn) \
4677 *piDst = (a_iType)uMantissa; \
4678 else \
4679 *piDst = -(a_iType)uMantissa; \
4680 \
4681 if (fRoundedOff) \
4682 { \
4683 fFsw |= X86_FSW_PE; \
4684 if (!(fFcw & X86_FCW_PM)) \
4685 fFsw |= X86_FSW_ES | X86_FSW_B; \
4686 } \
4687 } \
4688 /* \
4689 * Tiny sub-zero numbers. \
4690 */ \
4691 else if (iExponent < 0) \
4692 { \
4693 *piDst = 0; \
4694 fFsw |= X86_FSW_PE; \
4695 if (!(fFcw & X86_FCW_PM)) \
4696 fFsw |= X86_FSW_ES | X86_FSW_B; \
4697 } \
4698 /* \
4699 * Special MIN case. \
4700 */ \
4701 else if ( fSignIn && iExponent == a_cBits - 1 \
4702 && (a_cBits < 64 \
4703 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4704 : uMantissa == RT_BIT_64(63)) ) \
4705 { \
4706 *piDst = a_iTypeMin; \
4707 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4708 { \
4709 fFsw |= X86_FSW_PE; \
4710 if (!(fFcw & X86_FCW_PM)) \
4711 fFsw |= X86_FSW_ES | X86_FSW_B; \
4712 } \
4713 } \
4714 /* \
4715 * Figure this weirdness. \
4716 */ \
4717 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4718 { \
4719 *piDst = 0; \
4720 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4721 { \
4722 fFsw |= X86_FSW_PE; \
4723 if (!(fFcw & X86_FCW_PM)) \
4724 fFsw |= X86_FSW_ES | X86_FSW_B; \
4725 } \
4726 } \
4727 /* \
4728 * Too large/small number outside the target integer range. \
4729 */ \
4730 else \
4731 { \
4732 fFsw |= X86_FSW_IE; \
4733 if (fFcw & X86_FCW_IM) \
4734 *piDst = a_iTypeIndefinite; \
4735 else \
4736 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4737 } \
4738 } \
4739 /* \
4740 * Map both +0 and -0 to integer zero (signless/+). \
4741 */ \
4742 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4743 *piDst = 0; \
4744 /* \
4745 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4746 */ \
4747 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4748 { \
4749 *piDst = 0; \
4750 fFsw |= X86_FSW_PE; \
4751 if (!(fFcw & X86_FCW_PM)) \
4752 fFsw |= X86_FSW_ES | X86_FSW_B; \
4753 } \
4754 /* \
4755 * All other special values are considered invalid arguments and result \
4756 * in an IE exception and indefinite value if masked. \
4757 */ \
4758 else \
4759 { \
4760 fFsw |= X86_FSW_IE; \
4761 if (fFcw & X86_FCW_IM) \
4762 *piDst = a_iTypeIndefinite; \
4763 else \
4764 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4765 } \
4766 *pu16FSW = fFsw; \
4767}
4768#if defined(IEM_WITHOUT_ASSEMBLY)
4769EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4770EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4771EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4772#endif
4773EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4774EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4775
4776
4777#if defined(IEM_WITHOUT_ASSEMBLY)
4778
4779IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4780 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4781{
4782 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4783 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4784 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4785 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4786 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4787
4788 uint16_t const fFcw = pFpuState->FCW;
4789 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4790 bool const fSignIn = pr80Src->s.fSign;
4791
4792 /*
4793 * Deal with normal numbers first.
4794 */
4795 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4796 {
4797 uint64_t uMantissa = pr80Src->s.uMantissa;
4798 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4799 if ( (uint32_t)iExponent <= 58
4800 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4801 {
4802 unsigned const cShiftOff = 63 - iExponent;
4803 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4804 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4805 ? RT_BIT_64(cShiftOff - 1)
4806 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4807 ? fRoundingOffMask
4808 : 0;
4809 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4810
4811 uMantissa >>= cShiftOff;
4812 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4813 uMantissa += uRounding;
4814 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4815 {
4816 if (fRoundedOff)
4817 {
4818 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4819 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4820 else if (uRounding)
4821 fFsw |= X86_FSW_C1;
4822 fFsw |= X86_FSW_PE;
4823 if (!(fFcw & X86_FCW_PM))
4824 fFsw |= X86_FSW_ES | X86_FSW_B;
4825 }
4826
4827 pd80Dst->s.fSign = fSignIn;
4828 pd80Dst->s.uPad = 0;
4829 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4830 {
4831 unsigned const uDigits = uMantissa % 100;
4832 uMantissa /= 100;
4833 uint8_t const bLo = uDigits % 10;
4834 uint8_t const bHi = uDigits / 10;
4835 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4836 }
4837 }
4838 else
4839 {
4840 /* overflowed after rounding. */
4841 fFsw |= X86_FSW_IE;
4842 if (fFcw & X86_FCW_IM)
4843 *pd80Dst = s_d80Indefinite;
4844 else
4845 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4846 }
4847 }
4848 /*
4849 * Tiny sub-zero numbers.
4850 */
4851 else if (iExponent < 0)
4852 {
4853 if (!fSignIn)
4854 {
4855 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4856 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4857 {
4858 *pd80Dst = s_ad80One[fSignIn];
4859 fFsw |= X86_FSW_C1;
4860 }
4861 else
4862 *pd80Dst = s_ad80Zeros[fSignIn];
4863 }
4864 else
4865 {
4866 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4867 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4868 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4869 *pd80Dst = s_ad80Zeros[fSignIn];
4870 else
4871 {
4872 *pd80Dst = s_ad80One[fSignIn];
4873 fFsw |= X86_FSW_C1;
4874 }
4875 }
4876 fFsw |= X86_FSW_PE;
4877 if (!(fFcw & X86_FCW_PM))
4878 fFsw |= X86_FSW_ES | X86_FSW_B;
4879 }
4880 /*
4881 * Too large/small number outside the target integer range.
4882 */
4883 else
4884 {
4885 fFsw |= X86_FSW_IE;
4886 if (fFcw & X86_FCW_IM)
4887 *pd80Dst = s_d80Indefinite;
4888 else
4889 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4890 }
4891 }
4892 /*
4893 * Map both +0 and -0 to integer zero (signless/+).
4894 */
4895 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4896 *pd80Dst = s_ad80Zeros[fSignIn];
4897 /*
4898 * Denormals are just really tiny sub-zero numbers that are either rounded
4899 * to zero, 1 or -1 depending on sign and rounding control.
4900 */
4901 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4902 {
4903 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4904 *pd80Dst = s_ad80Zeros[fSignIn];
4905 else
4906 {
4907 *pd80Dst = s_ad80One[fSignIn];
4908 fFsw |= X86_FSW_C1;
4909 }
4910 fFsw |= X86_FSW_PE;
4911 if (!(fFcw & X86_FCW_PM))
4912 fFsw |= X86_FSW_ES | X86_FSW_B;
4913 }
4914 /*
4915 * All other special values are considered invalid arguments and result
4916 * in an IE exception and indefinite value if masked.
4917 */
4918 else
4919 {
4920 fFsw |= X86_FSW_IE;
4921 if (fFcw & X86_FCW_IM)
4922 *pd80Dst = s_d80Indefinite;
4923 else
4924 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4925 }
4926 *pu16FSW = fFsw;
4927}
4928
4929
4930/*********************************************************************************************************************************
4931* FPU Helpers *
4932*********************************************************************************************************************************/
/* Compile-time size checks for the floating point unions used by the FPU helpers. */
AssertCompileSize(RTFLOAT128U, 16);
AssertCompileSize(RTFLOAT80U, 10);
AssertCompileSize(RTFLOAT64U, 8);
AssertCompileSize(RTFLOAT32U, 4);
4937
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro will declare a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
 * a normalization was performed.
 *
 * Because it expands to a declaration followed by an if statement, it must be
 * used as a statement of its own with a trailing semicolon, and @a a_pr80Val
 * must be an assignable pointer variable.
 *
 * @note This must be applied before calling SoftFloat with a value that could be
 *       a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *       correctly.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized = *(a_pr80Val); \
        a_r80ValNormalized.s.uExponent = 1; \
        (a_pr80Val) = &a_r80ValNormalized; \
    } else do {} while (0)
4961
4962#ifdef IEM_WITH_FLOAT128_FOR_FPU
4963
4964DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4965{
4966 int fNew;
4967 switch (fFcw & X86_FCW_RC_MASK)
4968 {
4969 default:
4970 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4971 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4972 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4973 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4974 }
4975 int fOld = fegetround();
4976 fesetround(fNew);
4977 return fOld;
4978}
4979
4980
4981DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4982{
4983 fesetround(fOld);
4984}
4985
4986DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4987{
4988 RT_NOREF(fFcw);
4989 RTFLOAT128U Tmp;
4990 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4991 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4992 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4993 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4994 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4995 {
4996 Assert(Tmp.s.uExponent == 0);
4997 Tmp.s2.uSignAndExponent++;
4998 }
4999 return *(_Float128 *)&Tmp;
5000}
5001
5002
5003DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5004{
5005 RT_NOREF(fFcw);
5006 RTFLOAT128U Tmp;
5007 *(_Float128 *)&Tmp = rd128ValSrc;
5008 ASMCompilerBarrier();
5009 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5010 {
5011 pr80Dst->s.fSign = Tmp.s64.fSign;
5012 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5013 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5014 | Tmp.s64.uFractionLo >> (64 - 15);
5015
5016 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5017 unsigned const cShiftOff = 64 - 15;
5018 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5019 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5020 if (uRoundedOff)
5021 {
5022 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5023 ? RT_BIT_64(cShiftOff - 1)
5024 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5025 ? fRoundingOffMask
5026 : 0;
5027 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5028 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5029 || uRoundedOff != uRoundingAdd)
5030 {
5031 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5032 {
5033 uFraction += 1;
5034 if (!(uFraction & RT_BIT_64(63)))
5035 { /* likely */ }
5036 else
5037 {
5038 uFraction >>= 1;
5039 pr80Dst->s.uExponent++;
5040 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5041 return fFsw;
5042 }
5043 fFsw |= X86_FSW_C1;
5044 }
5045 }
5046 fFsw |= X86_FSW_PE;
5047 if (!(fFcw & X86_FCW_PM))
5048 fFsw |= X86_FSW_ES | X86_FSW_B;
5049 }
5050 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5051 }
5052 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5053 {
5054 pr80Dst->s.fSign = Tmp.s64.fSign;
5055 pr80Dst->s.uExponent = 0;
5056 pr80Dst->s.uMantissa = 0;
5057 }
5058 else if (RTFLOAT128U_IS_INF(&Tmp))
5059 {
5060 pr80Dst->s.fSign = Tmp.s64.fSign;
5061 pr80Dst->s.uExponent = 0;
5062 pr80Dst->s.uMantissa = 0;
5063 }
5064 return fFsw;
5065}
5066
5067
5068#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5069
/**
 * Brace initializer for a SoftFloat state structure, derived from an FPU
 * control word.
 *
 * The five values are positional.  The per-line notes below name the member
 * each one presumably initializes - confirm against the softfloat_state_t
 * declaration.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        /* tininess detection: */ \
        softfloat_tininess_afterRounding, \
        /* rounding mode, from FCW.RC: */ \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                      (uint8_t)softfloat_round_minMag, \
        /* pending exception flags, none to begin with: */ \
        0, \
        /* exception mask bits, from FCW: */ \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
        /* rounding precision, from FCW.PC (53-bit -> 64, 24-bit -> 32, otherwise 80): */ \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }
5083
/**
 * Returns an updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The exception flags of the state are merged into @a a_fFsw, the SoftFloat C1
 * flag is shifted up two bits into place, and ES + B are added when at least
 * one of the raised exceptions is not masked by @a a_fFcw.
 *
 * @note @a a_pSoftState is evaluated several times.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (  (a_fFsw) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | (  ((a_pSoftState)->exceptionFlags & ~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B \
        : 0) )
5091
5092
5093DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5094{
5095 RT_NOREF(fFcw);
5096 Assert(cBits > 64);
5097# if 0 /* rounding does not seem to help */
5098 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5099 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5100 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5101 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5102 {
5103 uint64_t uOld = r128.v[0];
5104 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5105 if (r128.v[0] < uOld)
5106 r128.v[1] += 1;
5107 }
5108# else
5109 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5110# endif
5111 return r128;
5112}
5113
5114
5115DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5116{
5117 RT_NOREF(fFcw);
5118 Assert(cBits > 64);
5119# if 0 /* rounding does not seem to help, not even on constants */
5120 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5121 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5122 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5123 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5124 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5125 {
5126 uint64_t uOld = r128.v[0];
5127 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5128 if (r128.v[0] < uOld)
5129 r128.v[1] += 1;
5130 }
5131 return r128;
5132# else
5133 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5134 return r128;
5135# endif
5136}
5137
5138
# if 0 /* unused */
/**
 * Converts an IPRT 128-bit floating point value to the SoftFloat format by
 * copying the two 64-bit words.  Currently compiled out.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
    return r128;
}
# endif
5146
5147
5148/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5149DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5150{
5151 extFloat80_t Tmp;
5152 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5153 Tmp.signif = pr80Val->s2.uMantissa;
5154 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5155 return extF80_to_f128(Tmp, &Ignored);
5156}
5157
5158
5159/**
5160 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5161 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5162 *
5163 * This is only a structure format conversion, nothing else.
5164 */
5165DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5166{
5167 extFloat80_t Tmp;
5168 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5169 Tmp.signif = pr80Val->s2.uMantissa;
5170 return Tmp;
5171}
5172
5173
5174/**
5175 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5176 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5177 *
5178 * This is only a structure format conversion, nothing else.
5179 */
5180DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5181{
5182 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5183 pr80Dst->s2.uMantissa = r80XSrc.signif;
5184 return pr80Dst;
5185}
5186
5187
5188DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5189{
5190 RT_NOREF(fFcw);
5191 RTFLOAT128U Tmp;
5192 *(float128_t *)&Tmp = r128Src;
5193 ASMCompilerBarrier();
5194
5195 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5196 {
5197 pr80Dst->s.fSign = Tmp.s64.fSign;
5198 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5199 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5200 | Tmp.s64.uFractionLo >> (64 - 15);
5201
5202 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5203 unsigned const cShiftOff = 64 - 15;
5204 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5205 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5206 if (uRoundedOff)
5207 {
5208 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5209 ? RT_BIT_64(cShiftOff - 1)
5210 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5211 ? fRoundingOffMask
5212 : 0;
5213 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5214 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5215 || uRoundedOff != uRoundingAdd)
5216 {
5217 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5218 {
5219 uFraction += 1;
5220 if (!(uFraction & RT_BIT_64(63)))
5221 { /* likely */ }
5222 else
5223 {
5224 uFraction >>= 1;
5225 pr80Dst->s.uExponent++;
5226 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5227 return fFsw;
5228 }
5229 fFsw |= X86_FSW_C1;
5230 }
5231 }
5232 fFsw |= X86_FSW_PE;
5233 if (!(fFcw & X86_FCW_PM))
5234 fFsw |= X86_FSW_ES | X86_FSW_B;
5235 }
5236
5237 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5238 }
5239 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5240 {
5241 pr80Dst->s.fSign = Tmp.s64.fSign;
5242 pr80Dst->s.uExponent = 0;
5243 pr80Dst->s.uMantissa = 0;
5244 }
5245 else if (RTFLOAT128U_IS_INF(&Tmp))
5246 {
5247 pr80Dst->s.fSign = Tmp.s64.fSign;
5248 pr80Dst->s.uExponent = 0x7fff;
5249 pr80Dst->s.uMantissa = 0;
5250 }
5251 return fFsw;
5252}
5253
5254
5255/**
5256 * Helper for transfering exception and C1 to FSW and setting the result value
5257 * accordingly.
5258 *
5259 * @returns Updated FSW.
5260 * @param pSoftState The SoftFloat state following the operation.
5261 * @param r80XResult The result of the SoftFloat operation.
5262 * @param pr80Result Where to store the result for IEM.
5263 * @param fFcw The FPU control word.
5264 * @param fFsw The FSW before the operation, with necessary bits
5265 * cleared and such.
5266 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5267 * raised.
5268 */
5269DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5270 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5271 PCRTFLOAT80U pr80XcptResult)
5272{
5273 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5274 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5275 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5276 fFsw |= X86_FSW_ES | X86_FSW_B;
5277
5278 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5279 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5280 else
5281 {
5282 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5283 *pr80Result = *pr80XcptResult;
5284 }
5285 return fFsw;
5286}
5287
5288
5289/**
5290 * Helper doing polynomial evaluation using Horner's method.
5291 *
5292 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5293 */
5294float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5295 unsigned cPrecision, softfloat_state_t *pSoftState)
5296{
5297 Assert(cHornerConsts > 1);
5298 size_t i = cHornerConsts - 1;
5299 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5300 while (i-- > 0)
5301 {
5302 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5303 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5304 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5305 }
5306 return r128Result;
5307}
5308
5309#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5310
5311
5312/**
5313 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5314 * mantissa, exponent and sign.
5315 *
5316 * @returns Updated FSW.
5317 * @param pr80Dst Where to return the composed value.
5318 * @param fSign The sign.
5319 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5320 * ignored and should be zero. This will probably be
5321 * modified during normalization and rounding.
5322 * @param iExponent Unbiased exponent.
5323 * @param fFcw The FPU control word.
5324 * @param fFsw The FPU status word.
5325 */
5326static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5327 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5328{
5329 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5330
5331 iExponent += RTFLOAT80U_EXP_BIAS;
5332
5333 /* Do normalization if necessary and possible. */
5334 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5335 {
5336 int cShift = 192 - RTUInt256BitCount(puMantissa);
5337 if (iExponent > cShift)
5338 iExponent -= cShift;
5339 else
5340 {
5341 if (fFcw & X86_FCW_UM)
5342 {
5343 if (iExponent > 0)
5344 cShift = --iExponent;
5345 else
5346 cShift = 0;
5347 }
5348 iExponent -= cShift;
5349 }
5350 RTUInt256AssignShiftLeft(puMantissa, cShift);
5351 }
5352
5353 /* Do rounding. */
5354 uint64_t uMantissa = puMantissa->QWords.qw2;
5355 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5356 {
5357 bool fAdd;
5358 switch (fFcw & X86_FCW_RC_MASK)
5359 {
5360 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5361 case X86_FCW_RC_NEAREST:
5362 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5363 {
5364 if ( (uMantissa & 1)
5365 || puMantissa->QWords.qw0 != 0
5366 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5367 {
5368 fAdd = true;
5369 break;
5370 }
5371 uMantissa &= ~(uint64_t)1;
5372 }
5373 fAdd = false;
5374 break;
5375 case X86_FCW_RC_ZERO:
5376 fAdd = false;
5377 break;
5378 case X86_FCW_RC_UP:
5379 fAdd = !fSign;
5380 break;
5381 case X86_FCW_RC_DOWN:
5382 fAdd = fSign;
5383 break;
5384 }
5385 if (fAdd)
5386 {
5387 uint64_t const uTmp = uMantissa;
5388 uMantissa = uTmp + 1;
5389 if (uMantissa < uTmp)
5390 {
5391 uMantissa >>= 1;
5392 uMantissa |= RT_BIT_64(63);
5393 iExponent++;
5394 }
5395 fFsw |= X86_FSW_C1;
5396 }
5397 fFsw |= X86_FSW_PE;
5398 if (!(fFcw & X86_FCW_PM))
5399 fFsw |= X86_FSW_ES | X86_FSW_B;
5400 }
5401
5402 /* Check for underflow (denormals). */
5403 if (iExponent <= 0)
5404 {
5405 if (fFcw & X86_FCW_UM)
5406 {
5407 if (uMantissa & RT_BIT_64(63))
5408 uMantissa >>= 1;
5409 iExponent = 0;
5410 }
5411 else
5412 {
5413 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5414 fFsw |= X86_FSW_ES | X86_FSW_B;
5415 }
5416 fFsw |= X86_FSW_UE;
5417 }
5418 /* Check for overflow */
5419 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5420 {
5421 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5422 }
5423
5424 /* Compose the result. */
5425 pr80Dst->s.uMantissa = uMantissa;
5426 pr80Dst->s.uExponent = iExponent;
5427 pr80Dst->s.fSign = fSign;
5428 return fFsw;
5429}
5430
5431
5432/**
5433 * See also iemAImpl_fld_r80_from_r32
5434 */
5435static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5436{
5437 uint16_t fFsw = 0;
5438 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5439 {
5440 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5441 pr80Dst->sj64.fInteger = 1;
5442 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5443 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5444 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5445 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5446 }
5447 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5448 {
5449 pr80Dst->s.fSign = pr32Val->s.fSign;
5450 pr80Dst->s.uExponent = 0;
5451 pr80Dst->s.uMantissa = 0;
5452 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5453 }
5454 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5455 {
5456 /* Subnormal -> normalized + X86_FSW_DE return. */
5457 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5458 pr80Dst->sj64.fInteger = 1;
5459 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5460 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5461 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5462 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5463 fFsw = X86_FSW_DE;
5464 }
5465 else if (RTFLOAT32U_IS_INF(pr32Val))
5466 {
5467 pr80Dst->s.fSign = pr32Val->s.fSign;
5468 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5469 pr80Dst->s.uMantissa = RT_BIT_64(63);
5470 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5471 }
5472 else
5473 {
5474 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5475 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5476 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5477 pr80Dst->sj64.fInteger = 1;
5478 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5479 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5480 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5481 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5482 }
5483 return fFsw;
5484}
5485
5486
5487/**
5488 * See also iemAImpl_fld_r80_from_r64
5489 */
5490static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5491{
5492 uint16_t fFsw = 0;
5493 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5494 {
5495 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5496 pr80Dst->sj64.fInteger = 1;
5497 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5498 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5499 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5500 }
5501 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5502 {
5503 pr80Dst->s.fSign = pr64Val->s.fSign;
5504 pr80Dst->s.uExponent = 0;
5505 pr80Dst->s.uMantissa = 0;
5506 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5507 }
5508 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5509 {
5510 /* Subnormal values gets normalized. */
5511 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5512 pr80Dst->sj64.fInteger = 1;
5513 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5514 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5515 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5516 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5517 fFsw = X86_FSW_DE;
5518 }
5519 else if (RTFLOAT64U_IS_INF(pr64Val))
5520 {
5521 pr80Dst->s.fSign = pr64Val->s.fSign;
5522 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5523 pr80Dst->s.uMantissa = RT_BIT_64(63);
5524 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5525 }
5526 else
5527 {
5528 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5529 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5530 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5531 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5532 pr80Dst->sj64.fInteger = 1;
5533 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5534 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5535 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5536 }
5537 return fFsw;
5538}
5539
5540
5541/**
5542 * See also EMIT_FILD.
5543 */
5544#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5545static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5546{ \
5547 if (iVal == 0) \
5548 { \
5549 pr80Dst->s.fSign = 0; \
5550 pr80Dst->s.uExponent = 0; \
5551 pr80Dst->s.uMantissa = 0; \
5552 } \
5553 else \
5554 { \
5555 if (iVal > 0) \
5556 pr80Dst->s.fSign = 0; \
5557 else \
5558 { \
5559 pr80Dst->s.fSign = 1; \
5560 iVal = -iVal; \
5561 } \
5562 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5563 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5564 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5565 } \
5566 return pr80Dst; \
5567}
5568EMIT_CONVERT_IXX_TO_R80(16)
5569EMIT_CONVERT_IXX_TO_R80(32)
5570//EMIT_CONVERT_IXX_TO_R80(64)
5571
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The 64-bit operand is converted to 80-bit and the work is then left to the
 * given 80-bit by 80-bit implementation, after which TOP in the resulting FSW
 * is replaced by 7 and any denormal flag from the conversion is merged in.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit implementation to call.
 * @param   a_DenormalException     Expression which, when true, makes the
 *                                  function ignore a denormal 64-bit operand
 *                                  (it may refer to pr80Val1).
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        /* The converted operand was a denormal. */ \
        bool const fIgnoreDenormal = RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                  || RTFLOAT80U_IS_NAN(pr80Val1) \
                                  || (a_DenormalException); \
        if (fIgnoreDenormal) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked denormal exception: return the first operand and flag it. */ \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                         | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5594
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The 32-bit operand is converted to 80-bit and the work is then left to the
 * given 80-bit by 80-bit implementation, after which TOP in the resulting FSW
 * is replaced by 7 and any denormal flag from the conversion is merged in.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit implementation to call.
 * @param   a_DenormalException     Expression which, when true, makes the
 *                                  function ignore a denormal 32-bit operand
 *                                  (it may refer to pr80Val1).
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        /* The converted operand was a denormal. */ \
        bool const fIgnoreDenormal = RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                  || RTFLOAT80U_IS_NAN(pr80Val1) \
                                  || (a_DenormalException); \
        if (fIgnoreDenormal) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked denormal exception: return the first operand and flag it. */ \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                         | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5617
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * The 32-bit integer operand is converted to 80-bit floating point and handed
 * to the given 80-bit by 80-bit implementation; TOP in the resulting FSW is
 * then replaced by 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit implementation to call.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5626
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * The 16-bit integer operand is converted to 80-bit floating point and handed
 * to the given 80-bit by 80-bit implementation; TOP in the resulting FSW is
 * then replaced by 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit implementation to call.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5635
5636
5637
5638/*********************************************************************************************************************************
5639* x87 FPU Division Operations *
5640*********************************************************************************************************************************/
5641
5642/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5643static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5644 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5645{
5646 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5647 {
5648 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5649 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5650 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5651 }
5652 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5653 { /* Div by zero. */
5654 if (fFcw & X86_FCW_ZM)
5655 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5656 else
5657 {
5658 *pr80Result = *pr80Val1Org;
5659 fFsw |= X86_FSW_ES | X86_FSW_B;
5660 }
5661 fFsw |= X86_FSW_ZE;
5662 }
5663 else
5664 { /* Invalid operand */
5665 if (fFcw & X86_FCW_IM)
5666 *pr80Result = g_r80Indefinite;
5667 else
5668 {
5669 *pr80Result = *pr80Val1Org;
5670 fFsw |= X86_FSW_ES | X86_FSW_B;
5671 }
5672 fFsw |= X86_FSW_IE;
5673 }
5674 return fFsw;
5675}
5676
5677
5678IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5679 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5680{
5681 uint16_t const fFcw = pFpuState->FCW;
5682 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5683
5684 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5685 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5686 {
5687 if (fFcw & X86_FCW_IM)
5688 pFpuRes->r80Result = g_r80Indefinite;
5689 else
5690 {
5691 pFpuRes->r80Result = *pr80Val1;
5692 fFsw |= X86_FSW_ES | X86_FSW_B;
5693 }
5694 fFsw |= X86_FSW_IE;
5695 }
5696 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5697 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5698 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5699 {
5700 if (fFcw & X86_FCW_DM)
5701 {
5702 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5703 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5704 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5705 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5706 }
5707 else
5708 {
5709 pFpuRes->r80Result = *pr80Val1;
5710 fFsw |= X86_FSW_ES | X86_FSW_B;
5711 }
5712 fFsw |= X86_FSW_DE;
5713 }
5714 /* SoftFloat can handle the rest: */
5715 else
5716 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5717
5718 pFpuRes->FSW = fFsw;
5719}
5720
5721
5722EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5723EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5724EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5725EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5726
5727
5728IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5729 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5730{
5731 uint16_t const fFcw = pFpuState->FCW;
5732 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5733
5734 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5735 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5736 {
5737 if (fFcw & X86_FCW_IM)
5738 pFpuRes->r80Result = g_r80Indefinite;
5739 else
5740 {
5741 pFpuRes->r80Result = *pr80Val1;
5742 fFsw |= X86_FSW_ES | X86_FSW_B;
5743 }
5744 fFsw |= X86_FSW_IE;
5745 }
5746 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5747 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5748 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5749 {
5750 if (fFcw & X86_FCW_DM)
5751 {
5752 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5753 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5754 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5755 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5756 }
5757 else
5758 {
5759 pFpuRes->r80Result = *pr80Val1;
5760 fFsw |= X86_FSW_ES | X86_FSW_B;
5761 }
5762 fFsw |= X86_FSW_DE;
5763 }
5764 /* SoftFloat can handle the rest: */
5765 else
5766 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5767
5768 pFpuRes->FSW = fFsw;
5769}
5770
5771
5772EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5773EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5774EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5775EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5776
5777
5778/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5779static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5780 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5781{
5782 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5783 {
5784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5785 uint16_t fCxFlags = 0;
5786 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5787 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5788 &fCxFlags, &SoftState);
5789 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5790 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5791 if ( !(fFsw & X86_FSW_IE)
5792 && !RTFLOAT80U_IS_NAN(pr80Result)
5793 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5794 {
5795 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5796 fFsw |= fCxFlags & X86_FSW_C_MASK;
5797 }
5798 return fFsw;
5799 }
5800
5801 /* Invalid operand */
5802 if (fFcw & X86_FCW_IM)
5803 *pr80Result = g_r80Indefinite;
5804 else
5805 {
5806 *pr80Result = *pr80Val1Org;
5807 fFsw |= X86_FSW_ES | X86_FSW_B;
5808 }
5809 return fFsw | X86_FSW_IE;
5810}
5811
5812
5813static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5814 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5815{
5816 uint16_t const fFcw = pFpuState->FCW;
5817 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5818
5819 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5820 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5821 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5822 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5823 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5824 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5825 {
5826 if (fFcw & X86_FCW_IM)
5827 pFpuRes->r80Result = g_r80Indefinite;
5828 else
5829 {
5830 pFpuRes->r80Result = *pr80Val1;
5831 fFsw |= X86_FSW_ES | X86_FSW_B;
5832 }
5833 fFsw |= X86_FSW_IE;
5834 }
5835 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5836 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5837 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5838 {
5839 if (fFcw & X86_FCW_DM)
5840 {
5841 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5842 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5843 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5844 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5845 pr80Val1Org, fLegacyInstr);
5846 }
5847 else
5848 {
5849 pFpuRes->r80Result = *pr80Val1;
5850 fFsw |= X86_FSW_ES | X86_FSW_B;
5851 }
5852 fFsw |= X86_FSW_DE;
5853 }
5854 /* SoftFloat can handle the rest: */
5855 else
5856 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5857 pr80Val1, fLegacyInstr);
5858
5859 pFpuRes->FSW = fFsw;
5860}
5861
5862
5863IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5864 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5865{
5866 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5867}
5868
5869
5870IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5871 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5872{
5873 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5874}
5875
5876
5877/*********************************************************************************************************************************
5878* x87 FPU Multiplication Operations *
5879*********************************************************************************************************************************/
5880
5881/** Worker for iemAImpl_fmul_r80_by_r80. */
5882static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5883 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5884{
5885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5886 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5887 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5888}
5889
5890
5891IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5892 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5893{
5894 uint16_t const fFcw = pFpuState->FCW;
5895 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5896
5897 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5898 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5899 {
5900 if (fFcw & X86_FCW_IM)
5901 pFpuRes->r80Result = g_r80Indefinite;
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_IE;
5908 }
5909 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5910 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5911 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5912 {
5913 if (fFcw & X86_FCW_DM)
5914 {
5915 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5916 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5917 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5918 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5919 }
5920 else
5921 {
5922 pFpuRes->r80Result = *pr80Val1;
5923 fFsw |= X86_FSW_ES | X86_FSW_B;
5924 }
5925 fFsw |= X86_FSW_DE;
5926 }
5927 /* SoftFloat can handle the rest: */
5928 else
5929 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5930
5931 pFpuRes->FSW = fFsw;
5932}
5933
5934
5935EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5936EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5937EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5938EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5939
5940
5941/*********************************************************************************************************************************
5942* x87 FPU Addition *
5943*********************************************************************************************************************************/
5944
5945/** Worker for iemAImpl_fadd_r80_by_r80. */
5946static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5947 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5948{
5949 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5950 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5951 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5952}
5953
5954
5955IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5956 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5957{
5958 uint16_t const fFcw = pFpuState->FCW;
5959 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5960
5961 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5962 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5963 {
5964 if (fFcw & X86_FCW_IM)
5965 pFpuRes->r80Result = g_r80Indefinite;
5966 else
5967 {
5968 pFpuRes->r80Result = *pr80Val1;
5969 fFsw |= X86_FSW_ES | X86_FSW_B;
5970 }
5971 fFsw |= X86_FSW_IE;
5972 }
5973 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5974 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5975 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5976 {
5977 if (fFcw & X86_FCW_DM)
5978 {
5979 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5980 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5981 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5982 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5983 }
5984 else
5985 {
5986 pFpuRes->r80Result = *pr80Val1;
5987 fFsw |= X86_FSW_ES | X86_FSW_B;
5988 }
5989 fFsw |= X86_FSW_DE;
5990 }
5991 /* SoftFloat can handle the rest: */
5992 else
5993 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5994
5995 pFpuRes->FSW = fFsw;
5996}
5997
5998
5999EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6000EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6001EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6002EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6003
6004
6005/*********************************************************************************************************************************
6006* x87 FPU Subtraction *
6007*********************************************************************************************************************************/
6008
6009/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6010static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6011 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6012{
6013 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6014 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6015 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6016}
6017
6018
6019IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6020 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6021{
6022 uint16_t const fFcw = pFpuState->FCW;
6023 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6024
6025 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6026 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6027 {
6028 if (fFcw & X86_FCW_IM)
6029 pFpuRes->r80Result = g_r80Indefinite;
6030 else
6031 {
6032 pFpuRes->r80Result = *pr80Val1;
6033 fFsw |= X86_FSW_ES | X86_FSW_B;
6034 }
6035 fFsw |= X86_FSW_IE;
6036 }
6037 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6038 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6039 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6040 {
6041 if (fFcw & X86_FCW_DM)
6042 {
6043 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6044 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6045 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6046 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6047 }
6048 else
6049 {
6050 pFpuRes->r80Result = *pr80Val1;
6051 fFsw |= X86_FSW_ES | X86_FSW_B;
6052 }
6053 fFsw |= X86_FSW_DE;
6054 }
6055 /* SoftFloat can handle the rest: */
6056 else
6057 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6058
6059 pFpuRes->FSW = fFsw;
6060}
6061
6062
6063EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6064EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6065EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6066EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6067
6068
6069/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6070IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6071 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6072{
6073 uint16_t const fFcw = pFpuState->FCW;
6074 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6075
6076 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6077 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6078 {
6079 if (fFcw & X86_FCW_IM)
6080 pFpuRes->r80Result = g_r80Indefinite;
6081 else
6082 {
6083 pFpuRes->r80Result = *pr80Val1;
6084 fFsw |= X86_FSW_ES | X86_FSW_B;
6085 }
6086 fFsw |= X86_FSW_IE;
6087 }
6088 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6089 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6090 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6091 {
6092 if (fFcw & X86_FCW_DM)
6093 {
6094 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6095 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6096 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6097 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6098 }
6099 else
6100 {
6101 pFpuRes->r80Result = *pr80Val1;
6102 fFsw |= X86_FSW_ES | X86_FSW_B;
6103 }
6104 fFsw |= X86_FSW_DE;
6105 }
6106 /* SoftFloat can handle the rest: */
6107 else
6108 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6109
6110 pFpuRes->FSW = fFsw;
6111}
6112
6113
/* Memory and integer operand flavours of FSUBR.  Each EMIT_R80_BY_* macro (defined
   outside this section) is given the name of the function to emit and the
   r80-by-r80 implementation it is based on.
   NOTE(review): presumably the emitted function converts its second operand to
   80-bit and calls the r80 implementation; the meaning of the trailing 0 is not
   visible here - confirm against the macro definitions. */
6114EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6115EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6116EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6117EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6118
6119
6120/*********************************************************************************************************************************
6121* x87 FPU Trigonometric Operations *
6122*********************************************************************************************************************************/
6123
6124
6125IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6126 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6127{
6128 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6129 AssertReleaseFailed();
6130}
6131
6132#endif /* IEM_WITHOUT_ASSEMBLY */
6133
6134IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6135 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6136{
6137 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6138}
6139
6140IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6141 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6142{
6143 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6144}
6145
6146
6147#if defined(IEM_WITHOUT_ASSEMBLY)
6148IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6149{
6150 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6151 AssertReleaseFailed();
6152}
6153#endif /* IEM_WITHOUT_ASSEMBLY */
6154
6155IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6156{
6157 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6158}
6159
6160IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6161{
6162 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6163}
6164
6165
6166#ifdef IEM_WITHOUT_ASSEMBLY
6167
6168static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6169{
6170 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6171 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6172 extFloat80_t v;
6173 (void)fFcw;
6174
6175 v = extF80_sin(x, &SoftState);
6176
6177 iemFpuSoftF80ToIprt(pr80Result, v);
6178
6179 return fFsw;
6180}
6181
6182IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6183{
6184 uint16_t const fFcw = pFpuState->FCW;
6185 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6186
6187 if (RTFLOAT80U_IS_ZERO(pr80Val))
6188 {
6189 pFpuRes->r80Result = *pr80Val;
6190 }
6191 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6192 {
6193 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6194 {
6195 fFsw |= X86_FSW_C2;
6196 pFpuRes->r80Result = *pr80Val;
6197 }
6198 else
6199 {
6200 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6201 {
6202 pFpuRes->r80Result = *pr80Val;
6203
6204 }
6205 else
6206 {
6207 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6208 }
6209 fFsw |= X86_FSW_PE;
6210 if (!(fFcw & X86_FCW_PM))
6211 fFsw |= X86_FSW_ES | X86_FSW_B;
6212 }
6213 }
6214 else if (RTFLOAT80U_IS_INF(pr80Val))
6215 {
6216 fFsw |= X86_FSW_IE;
6217 if (!(fFcw & X86_FCW_IM))
6218 {
6219 fFsw |= X86_FSW_ES | X86_FSW_B;
6220 pFpuRes->r80Result = *pr80Val;
6221 }
6222 else
6223 {
6224 pFpuRes->r80Result = g_r80Indefinite;
6225 }
6226 }
6227 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6228 {
6229 pFpuRes->r80Result = *pr80Val;
6230 fFsw |= X86_FSW_DE;
6231
6232 if (fFcw & X86_FCW_DM)
6233 {
6234 fFsw |= X86_FSW_UE | X86_FSW_PE;
6235
6236 if (!(fFcw & X86_FCW_UM) || !(fFcw & X86_FCW_PM))
6237 {
6238 fFsw |= X86_FSW_ES | X86_FSW_B;
6239 }
6240 }
6241 else
6242 {
6243 fFsw |= X86_FSW_ES | X86_FSW_B;
6244 }
6245 }
6246 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6247 {
6248 pFpuRes->r80Result = *pr80Val;
6249 fFsw |= X86_FSW_DE;
6250
6251 if (fFcw & X86_FCW_DM)
6252 {
6253 if (fFcw & X86_FCW_PM)
6254 {
6255 fFsw |= X86_FSW_PE;
6256 }
6257 else
6258 {
6259 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6260 }
6261
6262 pFpuRes->r80Result.sj64.uExponent = 1;
6263 }
6264 else
6265 {
6266 fFsw |= X86_FSW_ES | X86_FSW_B;
6267 }
6268 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6269 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6270 {
6271 pFpuRes->r80Result = *pr80Val;
6272 } else {
6273 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6274 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6275 && (fFcw & X86_FCW_IM))
6276 pFpuRes->r80Result = g_r80Indefinite;
6277 else
6278 {
6279 pFpuRes->r80Result = *pr80Val;
6280 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6281 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6282 }
6283
6284 fFsw |= X86_FSW_IE;
6285 if (!(fFcw & X86_FCW_IM))
6286 fFsw |= X86_FSW_ES | X86_FSW_B;
6287 }
6288
6289 pFpuRes->FSW = fFsw;
6290}
6291#endif /* IEM_WITHOUT_ASSEMBLY */
6292
6293IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6294{
6295 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6296}
6297
6298IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6299{
6300 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6301}
6302
6303#ifdef IEM_WITHOUT_ASSEMBLY
6304
6305static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6306{
6307 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6308 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6309 extFloat80_t v;
6310 (void)fFcw;
6311
6312 v = extF80_cos(x, &SoftState);
6313
6314 iemFpuSoftF80ToIprt(pr80Result, v);
6315
6316 return fFsw;
6317}
6318
6319IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6320{
6321 uint16_t const fFcw = pFpuState->FCW;
6322 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6323
6324 if (RTFLOAT80U_IS_ZERO(pr80Val))
6325 {
6326 pFpuRes->r80Result = g_ar80One[0];
6327 }
6328 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6329 {
6330 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6331 {
6332 fFsw |= X86_FSW_C2;
6333 pFpuRes->r80Result = *pr80Val;
6334 }
6335 else
6336 {
6337 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6338 {
6339 pFpuRes->r80Result = g_ar80One[0];
6340
6341 }
6342 else
6343 {
6344 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6345 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6346 }
6347 fFsw |= X86_FSW_PE;
6348 if (!(fFcw & X86_FCW_PM))
6349 fFsw |= X86_FSW_ES | X86_FSW_B;
6350 }
6351 }
6352 else if (RTFLOAT80U_IS_INF(pr80Val))
6353 {
6354 fFsw |= X86_FSW_IE;
6355 if (!(fFcw & X86_FCW_IM))
6356 {
6357 fFsw |= X86_FSW_ES | X86_FSW_B;
6358 pFpuRes->r80Result = *pr80Val;
6359 }
6360 else
6361 {
6362 pFpuRes->r80Result = g_r80Indefinite;
6363 }
6364 }
6365 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6366 {
6367 fFsw |= X86_FSW_DE;
6368
6369 if (fFcw & X86_FCW_DM)
6370 {
6371 pFpuRes->r80Result = g_ar80One[0];
6372
6373 if (fFcw & X86_FCW_PM)
6374 {
6375 fFsw |= X86_FSW_PE;
6376 }
6377 else
6378 {
6379 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6380 }
6381 }
6382 else
6383 {
6384 pFpuRes->r80Result = *pr80Val;
6385 fFsw |= X86_FSW_ES | X86_FSW_B;
6386 }
6387 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6388 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6389 {
6390 pFpuRes->r80Result = *pr80Val;
6391 } else {
6392 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6393 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6394 && (fFcw & X86_FCW_IM))
6395 pFpuRes->r80Result = g_r80Indefinite;
6396 else
6397 {
6398 pFpuRes->r80Result = *pr80Val;
6399 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6400 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6401 }
6402
6403 fFsw |= X86_FSW_IE;
6404 if (!(fFcw & X86_FCW_IM))
6405 fFsw |= X86_FSW_ES | X86_FSW_B;
6406 }
6407
6408 pFpuRes->FSW = fFsw;
6409}
6410#endif /* IEM_WITHOUT_ASSEMBLY */
6411
6412IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6413{
6414 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6415}
6416
6417IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6418{
6419 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6420}
6421
6422#ifdef IEM_WITHOUT_ASSEMBLY
6423
6424static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6425{
6426 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6427 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6428 extFloat80_t r80Sin, r80Cos;
6429 (void)fFcw;
6430
6431 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6432
6433 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6434 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6435
6436 return fFsw;
6437}
6438
6439IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6440{
6441 uint16_t const fFcw = pFpuState->FCW;
6442 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6443
6444 if (RTFLOAT80U_IS_ZERO(pr80Val))
6445 {
6446 pFpuResTwo->r80Result1 = *pr80Val;
6447 pFpuResTwo->r80Result2 = g_ar80One[0];
6448 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6449 }
6450 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6451 {
6452 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6453 {
6454 fFsw |= X86_FSW_C2;
6455
6456 if (fFcw & X86_FCW_IM)
6457 {
6458 pFpuResTwo->r80Result1 = g_r80Indefinite;
6459 }
6460 else
6461 {
6462 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6463 }
6464
6465 pFpuResTwo->r80Result2 = *pr80Val;
6466 }
6467 else
6468 {
6469 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6470
6471 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6472 {
6473 pFpuResTwo->r80Result1 = *pr80Val;
6474 pFpuResTwo->r80Result2 = g_ar80One[0];
6475 }
6476 else
6477 {
6478 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6479 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6480 }
6481 fFsw |= X86_FSW_PE;
6482 if (!(fFcw & X86_FCW_PM))
6483 fFsw |= X86_FSW_ES | X86_FSW_B;
6484 }
6485 }
6486 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6487 {
6488 fFsw |= X86_FSW_DE;
6489
6490 if (fFcw & X86_FCW_DM)
6491 {
6492 pFpuResTwo->r80Result1 = *pr80Val;
6493 pFpuResTwo->r80Result2 = g_ar80One[0];
6494 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6495
6496 if (fFcw & X86_FCW_PM)
6497 {
6498 fFsw |= X86_FSW_PE;
6499 }
6500 else
6501 {
6502 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6503 }
6504
6505 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6506 }
6507 else
6508 {
6509 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6510 pFpuResTwo->r80Result2 = *pr80Val;
6511 fFsw |= X86_FSW_ES | X86_FSW_B;
6512 }
6513 }
6514 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6515 {
6516 fFsw |= X86_FSW_DE;
6517
6518 if (fFcw & X86_FCW_DM)
6519 {
6520 pFpuResTwo->r80Result1 = *pr80Val;
6521 pFpuResTwo->r80Result2 = g_ar80One[0];
6522
6523 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6524 fFsw |= X86_FSW_UE | X86_FSW_PE;
6525
6526 if (fFcw & X86_FCW_PM)
6527 {
6528 if (!(fFcw & X86_FCW_UM))
6529 fFsw |= X86_FSW_ES | X86_FSW_B;
6530 }
6531 else
6532 {
6533 fFsw |= X86_FSW_ES | X86_FSW_B;
6534 }
6535 }
6536 else
6537 {
6538 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6539 pFpuResTwo->r80Result2 = *pr80Val;
6540 fFsw |= X86_FSW_ES | X86_FSW_B;
6541 }
6542 }
6543 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6544 {
6545 pFpuResTwo->r80Result1 = *pr80Val;
6546 pFpuResTwo->r80Result2 = *pr80Val;
6547 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6548 }
6549 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6550 {
6551 if (fFcw & X86_FCW_IM)
6552 {
6553 pFpuResTwo->r80Result1 = g_r80Indefinite;
6554 pFpuResTwo->r80Result2 = g_r80Indefinite;
6555 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6556 }
6557 else
6558 {
6559 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6560 pFpuResTwo->r80Result2 = *pr80Val;
6561 }
6562
6563 fFsw |= X86_FSW_IE;
6564 if (!(fFcw & X86_FCW_IM))
6565 fFsw |= X86_FSW_ES | X86_FSW_B;
6566 }
6567 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6568 {
6569 pFpuResTwo->r80Result1 = *pr80Val;
6570 pFpuResTwo->r80Result2 = *pr80Val;
6571
6572 if (fFcw & X86_FCW_IM)
6573 {
6574 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6575 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6576 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6577 }
6578 else
6579 {
6580 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6581 pFpuResTwo->r80Result2 = *pr80Val;
6582 }
6583
6584 fFsw |= X86_FSW_IE;
6585 if (!(fFcw & X86_FCW_IM))
6586 fFsw |= X86_FSW_ES | X86_FSW_B;
6587 }
6588 else if (RTFLOAT80U_IS_INF(pr80Val))
6589 {
6590 if (fFcw & X86_FCW_IM)
6591 {
6592 pFpuResTwo->r80Result1 = g_r80Indefinite;
6593 pFpuResTwo->r80Result2 = g_r80Indefinite;
6594 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6595 }
6596 else
6597 {
6598 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6599 pFpuResTwo->r80Result2 = *pr80Val;
6600 }
6601
6602 fFsw |= X86_FSW_IE;
6603 if (!(fFcw & X86_FCW_IM))
6604 fFsw |= X86_FSW_ES | X86_FSW_B;
6605 }
6606
6607 pFpuResTwo->FSW = fFsw;
6608}
6609#endif /* IEM_WITHOUT_ASSEMBLY */
6610
6611IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6612{
6613 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6614}
6615
6616IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6617{
6618 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6619}
6620
6621#ifdef IEM_WITHOUT_ASSEMBLY
6622
6623
6624/*********************************************************************************************************************************
6625* x87 FPU Compare and Testing Operations *
6626*********************************************************************************************************************************/
6627
6628IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6629{
6630 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6631
6632 if (RTFLOAT80U_IS_ZERO(pr80Val))
6633 fFsw |= X86_FSW_C3;
6634 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6635 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6636 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6637 {
6638 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6639 if (!(pFpuState->FCW & X86_FCW_DM))
6640 fFsw |= X86_FSW_ES | X86_FSW_B;
6641 }
6642 else
6643 {
6644 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6645 if (!(pFpuState->FCW & X86_FCW_IM))
6646 fFsw |= X86_FSW_ES | X86_FSW_B;
6647 }
6648
6649 *pu16Fsw = fFsw;
6650}
6651
6652
6653IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6654{
6655 RT_NOREF(pFpuState);
6656 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6657
6658 /* C1 = sign bit (always, even if empty Intel says). */
6659 if (pr80Val->s.fSign)
6660 fFsw |= X86_FSW_C1;
6661
6662 /* Classify the value in C0, C2, C3. */
6663 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6664 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6665 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6666 fFsw |= X86_FSW_C2;
6667 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6668 fFsw |= X86_FSW_C3;
6669 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6670 fFsw |= X86_FSW_C0;
6671 else if (RTFLOAT80U_IS_INF(pr80Val))
6672 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6673 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6674 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6675 /* whatever else: 0 */
6676
6677 *pu16Fsw = fFsw;
6678}
6679
6680
6681/**
6682 * Worker for fcom, fucom, and friends.
6683 */
6684static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6685 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6686{
6687 /*
6688 * Unpack the values.
6689 */
6690 bool const fSign1 = pr80Val1->s.fSign;
6691 int32_t iExponent1 = pr80Val1->s.uExponent;
6692 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6693
6694 bool const fSign2 = pr80Val2->s.fSign;
6695 int32_t iExponent2 = pr80Val2->s.uExponent;
6696 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6697
6698 /*
6699 * Check for invalid inputs.
6700 */
6701 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6702 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6703 {
6704 if (!(fFcw & X86_FCW_IM))
6705 fFsw |= X86_FSW_ES | X86_FSW_B;
6706 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6707 }
6708
6709 /*
6710 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6711 */
6712 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6713 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6714 {
6715 if ( fIeOnAllNaNs
6716 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6717 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6718 {
6719 fFsw |= X86_FSW_IE;
6720 if (!(fFcw & X86_FCW_IM))
6721 fFsw |= X86_FSW_ES | X86_FSW_B;
6722 }
6723 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6724 }
6725
6726 /*
6727 * Normalize the values.
6728 */
6729 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6730 {
6731 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6732 iExponent1 = 1;
6733 else
6734 {
6735 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6736 uMantissa1 <<= iExponent1;
6737 iExponent1 = 1 - iExponent1;
6738 }
6739 fFsw |= X86_FSW_DE;
6740 if (!(fFcw & X86_FCW_DM))
6741 fFsw |= X86_FSW_ES | X86_FSW_B;
6742 }
6743
6744 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6745 {
6746 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6747 iExponent2 = 1;
6748 else
6749 {
6750 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6751 uMantissa2 <<= iExponent2;
6752 iExponent2 = 1 - iExponent2;
6753 }
6754 fFsw |= X86_FSW_DE;
6755 if (!(fFcw & X86_FCW_DM))
6756 fFsw |= X86_FSW_ES | X86_FSW_B;
6757 }
6758
6759 /*
6760 * Test if equal (val1 == val2):
6761 */
6762 if ( uMantissa1 == uMantissa2
6763 && iExponent1 == iExponent2
6764 && ( fSign1 == fSign2
6765 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6766 fFsw |= X86_FSW_C3;
6767 /*
6768 * Test if less than (val1 < val2):
6769 */
6770 else if (fSign1 && !fSign2)
6771 fFsw |= X86_FSW_C0;
6772 else if (fSign1 == fSign2)
6773 {
6774 /* Zeros are problematic, however at the most one can be zero here. */
6775 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6776 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6777 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6778 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6779
6780 if ( fSign1
6781 ^ ( iExponent1 < iExponent2
6782 || ( iExponent1 == iExponent2
6783 && uMantissa1 < uMantissa2 ) ) )
6784 fFsw |= X86_FSW_C0;
6785 }
6786 /* else: No flags set if greater. */
6787
6788 return fFsw;
6789}
6790
6791
6792IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6793 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6794{
6795 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6796}
6797
6798
6799
6800
6801IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6802 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6803{
6804 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6805}
6806
6807
6808IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6809 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6810{
6811 RTFLOAT80U r80Val2;
6812 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6813 Assert(!fFsw || fFsw == X86_FSW_DE);
6814 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6815 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6816 {
6817 if (!(pFpuState->FCW & X86_FCW_DM))
6818 fFsw |= X86_FSW_ES | X86_FSW_B;
6819 *pfFsw |= fFsw;
6820 }
6821}
6822
6823
6824IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6825 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6826{
6827 RTFLOAT80U r80Val2;
6828 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6829 Assert(!fFsw || fFsw == X86_FSW_DE);
6830 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6831 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6832 {
6833 if (!(pFpuState->FCW & X86_FCW_DM))
6834 fFsw |= X86_FSW_ES | X86_FSW_B;
6835 *pfFsw |= fFsw;
6836 }
6837}
6838
6839
6840IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6841 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6842{
6843 RTFLOAT80U r80Val2;
6844 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6845 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6846}
6847
6848
6849IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6850 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6851{
6852 RTFLOAT80U r80Val2;
6853 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6854 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6855}
6856
6857
6858/**
6859 * Worker for fcomi & fucomi.
6860 */
6861static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6862 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6863{
6864 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6865 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6866 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6867 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6868
6869 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6870 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6871 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6872}
6873
6874
6875IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6876 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6877{
6878 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
6879}
6880
6881
6882IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6883 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6884{
6885 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
6886}
6887
6888
6889/*********************************************************************************************************************************
6890* x87 FPU Other Operations *
6891*********************************************************************************************************************************/
6892
6893/**
6894 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6895 */
6896static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6897{
6898 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6899 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
6900 true /*exact / generate #PE */, &SoftState));
6901 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6902}
6903
6904
6905IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6906{
6907 uint16_t const fFcw = pFpuState->FCW;
6908 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6909
6910 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6911 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6912 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6913 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6914 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6915 || RTFLOAT80U_IS_INF(pr80Val))
6916 pFpuRes->r80Result = *pr80Val;
6917 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6918 {
6919 fFsw |= X86_FSW_DE;
6920 if (fFcw & X86_FCW_DM)
6921 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6922 else
6923 {
6924 pFpuRes->r80Result = *pr80Val;
6925 fFsw |= X86_FSW_ES | X86_FSW_B;
6926 }
6927 }
6928 else
6929 {
6930 if (fFcw & X86_FCW_IM)
6931 {
6932 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6933 pFpuRes->r80Result = g_r80Indefinite;
6934 else
6935 {
6936 pFpuRes->r80Result = *pr80Val;
6937 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6938 }
6939 }
6940 else
6941 {
6942 pFpuRes->r80Result = *pr80Val;
6943 fFsw |= X86_FSW_ES | X86_FSW_B;
6944 }
6945 fFsw |= X86_FSW_IE;
6946 }
6947 pFpuRes->FSW = fFsw;
6948}
6949
6950
6951IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6952 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6953{
6954 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
6955 it does everything we need it to do. */
6956 uint16_t const fFcw = pFpuState->FCW;
6957 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6958 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6959 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6960 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6961}
6962
6963
6964/**
6965 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
6966 */
6967static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6968{
6969 Assert(!pr80Val->s.fSign);
6970 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6971 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
6972 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6973}
6974
6975
6976IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6977{
6978 uint16_t const fFcw = pFpuState->FCW;
6979 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6980
6981 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
6982 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6983 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6984 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6985 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6986 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
6987 pFpuRes->r80Result = *pr80Val;
6988 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
6989 {
6990 fFsw |= X86_FSW_DE;
6991 if (fFcw & X86_FCW_DM)
6992 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6993 else
6994 {
6995 pFpuRes->r80Result = *pr80Val;
6996 fFsw |= X86_FSW_ES | X86_FSW_B;
6997 }
6998 }
6999 else
7000 {
7001 if (fFcw & X86_FCW_IM)
7002 {
7003 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7004 pFpuRes->r80Result = g_r80Indefinite;
7005 else
7006 {
7007 pFpuRes->r80Result = *pr80Val;
7008 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7009 }
7010 }
7011 else
7012 {
7013 pFpuRes->r80Result = *pr80Val;
7014 fFsw |= X86_FSW_ES | X86_FSW_B;
7015 }
7016 fFsw |= X86_FSW_IE;
7017 }
7018 pFpuRes->FSW = fFsw;
7019}
7020
7021
7022/**
7023 * @code{.unparsed}
7024 *           x        x * ln2
7025 *   f(x) = 2  - 1 = e        - 1
7026 *
7027 * @endcode
7028 *
7029 * We can approximate e^x by a Taylor/Maclaurin series (see
7030 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7031 * @code{.unparsed}
7032 *            n      0     1     2     3     4
7033 *    inf    x      x     x     x     x     x
7034 *    SUM  ----- = --- + --- + --- + --- + --- + ...
7035 *    n=0   n!     0!    1!    2!    3!    4!
7036 *
7037 *                           2     3     4
7038 *                          x     x     x
7039 *               = 1 + x + --- + --- + --- + ...
7040 *                          2!    3!    4!
7041 * @endcode
7042 *
7043 * Given z = x * ln2, we get:
7044 * @code{.unparsed}
7045 *                   2     3     4           n
7046 *     z            z     z     z           z
7047 *    e  - 1 = z + --- + --- + --- + ... + ---
7048 *                  2!    3!    4!          n!
7049 * @endcode
7050 *
7051 * Wanting to use Horner's method, we move one z outside and get:
7052 * @code{.unparsed}
7053 *                             2     3           (n-1)
7054 *                      z     z     z           z
7055 *           = z ( 1 + --- + --- + --- + ... + ------- )
7056 *                      2!    3!    4!           n!
7057 * @endcode
7058 *
7059 * The constants we need for using Horner's methods are 1 and 1 / n!.
7060 *
7061 * For very tiny x values, we can get away with f(x) = x * ln 2, because
7062 * we don't have the necessary precision to represent 1.0 + z/3 + ...
7063 * and can approximate it to be 1.0. For a visual demonstration of this
7064 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7065 * as it is valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7066 *
7067 *
7068 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7069 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7070 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7071 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7072 * blocks). (The one bit difference is probably an implicit one missing from
7073 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7074 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7075 * exponent.
7076 *
7077 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have not
7078 * yet successfully reproduced the exact results from an Intel 10980XE; there is
7079 * always a portion of rounding differences. Not going to spend too much time
7080 * on getting this 100% the same, at least not now.
7081 *
7082 * P.S. If someone is really curious about the 8087 and its constants:
7083 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7084 *
7085 *
7086 * @param pr80Val The exponent value (x), less than 1.0, greater than
7087 * -1.0 and not zero. This can be a normal, denormal
7088 * or pseudo-denormal value.
7089 * @param pr80Result Where to return the result.
7090 * @param fFcw FPU control word.
7091 * @param fFsw FPU status word.
7092 */
7093static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7094{
7095 /* As mentioned above, we can skip the expensive polynomial calculation
7096 as it will be close enough to 1.0 that it makes no difference.
7097
7098 The cutoff point for intel 10980XE is exponents >= -69. Intel
7099 also seems to be using a 67-bit or 68-bit constant value, and we get
7100 a smattering of rounding differences if we go for higher precision. */
7101 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7102 {
7103 RTUINT256U u256;
7104 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7105 u256.QWords.qw0 |= 1; /* force #PE */
7106 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7107 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7108 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7109 : 1 - RTFLOAT80U_EXP_BIAS,
7110 fFcw, fFsw);
7111 }
7112 else
7113 {
7114#ifdef IEM_WITH_FLOAT128_FOR_FPU
7115 /* This approach is not good enough for small values, we end up with zero. */
7116 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7117 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7118 _Float128 rd128Result = powf128(2.0L, rd128Val);
7119 rd128Result -= 1.0L;
7120 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7121 iemFpuF128RestoreRounding(fOldRounding);
7122
7123# else
7124 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7125 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7126
7127 /* As mentioned above, enforce 68-bit internal mantissa width to better
7128 match the Intel 10980XE results. */
7129 unsigned const cPrecision = 68;
7130
7131 /* first calculate z = x * ln2 */
7132 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7133 cPrecision);
7134
7135 /* Then do the polynomial evaluation. */
7136 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7137 cPrecision, &SoftState);
7138 r = f128_mul(z, r, &SoftState);
7139
7140 /* Output the result. */
7141 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7142# endif
7143 }
7144 return fFsw;
7145}
7146
7147
7148IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7149{
7150 uint16_t const fFcw = pFpuState->FCW;
7151 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7152
7153 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7154 {
7155 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7156 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7157 else
7158 {
7159 /* Special case:
7160 2^+1.0 - 1.0 = 1.0
7161 2^-1.0 - 1.0 = -0.5 */
7162 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7163 && pr80Val->s.uMantissa == RT_BIT_64(63))
7164 {
7165 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7166 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7167 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7168 }
7169 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7170 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7171 else
7172 pFpuRes->r80Result = *pr80Val;
7173 fFsw |= X86_FSW_PE;
7174 if (!(fFcw & X86_FCW_PM))
7175 fFsw |= X86_FSW_ES | X86_FSW_B;
7176 }
7177 }
7178 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7179 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7180 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7181 pFpuRes->r80Result = *pr80Val;
7182 else if (RTFLOAT80U_IS_INF(pr80Val))
7183 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7184 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7185 {
7186 fFsw |= X86_FSW_DE;
7187 if (fFcw & X86_FCW_DM)
7188 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7189 else
7190 {
7191 pFpuRes->r80Result = *pr80Val;
7192 fFsw |= X86_FSW_ES | X86_FSW_B;
7193 }
7194 }
7195 else
7196 {
7197 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7198 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7199 && (fFcw & X86_FCW_IM))
7200 pFpuRes->r80Result = g_r80Indefinite;
7201 else
7202 {
7203 pFpuRes->r80Result = *pr80Val;
7204 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7205 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7206 }
7207 fFsw |= X86_FSW_IE;
7208 if (!(fFcw & X86_FCW_IM))
7209 fFsw |= X86_FSW_ES | X86_FSW_B;
7210 }
7211 pFpuRes->FSW = fFsw;
7212}
7213
7214#endif /* IEM_WITHOUT_ASSEMBLY */
7215
7216IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7217{
7218 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7219}
7220
7221IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7222{
7223 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7224}
7225
7226#ifdef IEM_WITHOUT_ASSEMBLY
7227
7228IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7229{
7230 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7231 pFpuRes->r80Result = *pr80Val;
7232 pFpuRes->r80Result.s.fSign = 0;
7233}
7234
7235
7236IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7237{
7238 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7239 pFpuRes->r80Result = *pr80Val;
7240 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7241}
7242
7243
7244IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7245{
7246 uint16_t const fFcw = pFpuState->FCW;
7247 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7248
7249 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7250 {
7251 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7252 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7253
7254 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7255 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7256 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7257 }
7258 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7259 {
7260 fFsw |= X86_FSW_ZE;
7261 if (fFcw & X86_FCW_ZM)
7262 {
7263 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7264 pFpuResTwo->r80Result2 = *pr80Val;
7265 }
7266 else
7267 {
7268 pFpuResTwo->r80Result2 = *pr80Val;
7269 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7270 }
7271 }
7272 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7273 {
7274 fFsw |= X86_FSW_DE;
7275 if (fFcw & X86_FCW_DM)
7276 {
7277 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7278 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7279 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7280 int32_t iExponent = -16382;
7281 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7282 {
7283 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7284 iExponent--;
7285 }
7286
7287 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7288 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7289 }
7290 else
7291 {
7292 pFpuResTwo->r80Result2 = *pr80Val;
7293 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7294 }
7295 }
7296 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7297 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7298 {
7299 pFpuResTwo->r80Result1 = *pr80Val;
7300 pFpuResTwo->r80Result2 = *pr80Val;
7301 }
7302 else if (RTFLOAT80U_IS_INF(pr80Val))
7303 {
7304 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7305 pFpuResTwo->r80Result2 = *pr80Val;
7306 }
7307 else
7308 {
7309 if (fFcw & X86_FCW_IM)
7310 {
7311 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7312 pFpuResTwo->r80Result1 = g_r80Indefinite;
7313 else
7314 {
7315 pFpuResTwo->r80Result1 = *pr80Val;
7316 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7317 }
7318 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7319 }
7320 else
7321 {
7322 pFpuResTwo->r80Result2 = *pr80Val;
7323 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7324 }
7325 fFsw |= X86_FSW_IE;
7326 }
7327 pFpuResTwo->FSW = fFsw;
7328}
7329
7330
7331IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7332 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7333{
7334 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7335 AssertReleaseFailed();
7336}
7337
7338#endif /* IEM_WITHOUT_ASSEMBLY */
7339
7340IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7341 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7342{
7343 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7344}
7345
7346IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7347 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7348{
7349 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7350}
7351
7352#if defined(IEM_WITHOUT_ASSEMBLY)
7353
7354IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7355 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7356{
7357 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7358 AssertReleaseFailed();
7359}
7360
7361#endif /* IEM_WITHOUT_ASSEMBLY */
7362
7363IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7364 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7365{
7366 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7367}
7368
7369IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7370 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7371{
7372 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7373}
7374
7375
7376/*********************************************************************************************************************************
7377* MMX, SSE & AVX *
7378*********************************************************************************************************************************/
7379
7380/*
7381 * MOVSLDUP / VMOVSLDUP
7382 */
7383IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7384{
7385 puDst->au32[0] = puSrc->au32[0];
7386 puDst->au32[1] = puSrc->au32[0];
7387 puDst->au32[2] = puSrc->au32[2];
7388 puDst->au32[3] = puSrc->au32[2];
7389}
7390
7391#ifdef IEM_WITH_VEX
7392
7393IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7394{
7395 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7396 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7397 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7398 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7399 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7400 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7401 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7402 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7403}
7404
7405
7406IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7407{
7408 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7409 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7410 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7411 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7412 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7413 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7414 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7415 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7416}
7417
7418#endif /* IEM_WITH_VEX */
7419
7420
7421/*
7422 * MOVSHDUP / VMOVSHDUP
7423 */
7424IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7425{
7426 puDst->au32[0] = puSrc->au32[1];
7427 puDst->au32[1] = puSrc->au32[1];
7428 puDst->au32[2] = puSrc->au32[3];
7429 puDst->au32[3] = puSrc->au32[3];
7430}
7431
7432#ifdef IEM_WITH_VEX
7433
7434IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7435{
7436 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7437 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7438 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7439 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7440 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7441 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7442 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7443 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7444}
7445
7446
7447IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7448{
7449 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7450 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7451 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7452 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7453 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7454 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7455 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7456 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7457}
7458
7459#endif /* IEM_WITH_VEX */
7460
7461
7462/*
7463 * MOVDDUP / VMOVDDUP
7464 */
7465IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PRTUINT128U puDst, uint64_t uSrc))
7466{
7467 puDst->au64[0] = uSrc;
7468 puDst->au64[1] = uSrc;
7469}
7470
7471#ifdef IEM_WITH_VEX
7472
7473IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7474{
7475 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7476 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7477 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7478 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7479}
7480
7481IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7482{
7483 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7484 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7485 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7486 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7487}
7488
7489#endif /* IEM_WITH_VEX */
7490
7491
7492/*
7493 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7494 */
7495#ifdef IEM_WITHOUT_ASSEMBLY
7496
7497IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7498{
7499 RT_NOREF(pFpuState);
7500 *puDst &= *puSrc;
7501}
7502
7503
7504IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7505{
7506 RT_NOREF(pFpuState);
7507 puDst->au64[0] &= puSrc->au64[0];
7508 puDst->au64[1] &= puSrc->au64[1];
7509}
7510
7511#endif
7512
7513IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7514 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7515{
7516 RT_NOREF(pExtState);
7517 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7518 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7519}
7520
7521
7522IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7523 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7524{
7525 RT_NOREF(pExtState);
7526 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7527 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7528 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7529 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7530}
7531
7532
7533/*
7534 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7535 */
7536#ifdef IEM_WITHOUT_ASSEMBLY
7537
7538IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7539{
7540 RT_NOREF(pFpuState);
7541 *puDst = ~*puDst & *puSrc;
7542}
7543
7544
7545IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7546{
7547 RT_NOREF(pFpuState);
7548 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7549 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7550}
7551
7552#endif
7553
7554IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7555 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7556{
7557 RT_NOREF(pExtState);
7558 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7559 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7560}
7561
7562
7563IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7564 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7565{
7566 RT_NOREF(pExtState);
7567 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7568 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7569 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7570 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7571}
7572
7573
7574/*
7575 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7576 */
7577#ifdef IEM_WITHOUT_ASSEMBLY
7578
7579IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7580{
7581 RT_NOREF(pFpuState);
7582 *puDst |= *puSrc;
7583}
7584
7585
7586IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7587{
7588 RT_NOREF(pFpuState);
7589 puDst->au64[0] |= puSrc->au64[0];
7590 puDst->au64[1] |= puSrc->au64[1];
7591}
7592
7593#endif
7594
7595IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7596 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7597{
7598 RT_NOREF(pExtState);
7599 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7600 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7601}
7602
7603
7604IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7605 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7606{
7607 RT_NOREF(pExtState);
7608 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7609 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7610 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7611 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7612}
7613
7614
7615/*
7616 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7617 */
7618#ifdef IEM_WITHOUT_ASSEMBLY
7619
7620IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7621{
7622 RT_NOREF(pFpuState);
7623 *puDst ^= *puSrc;
7624}
7625
7626
7627IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7628{
7629 RT_NOREF(pFpuState);
7630 puDst->au64[0] ^= puSrc->au64[0];
7631 puDst->au64[1] ^= puSrc->au64[1];
7632}
7633
7634#endif
7635
7636IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7637 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7638{
7639 RT_NOREF(pExtState);
7640 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7641 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7642}
7643
7644
7645IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7646 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7647{
7648 RT_NOREF(pExtState);
7649 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7650 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7651 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7652 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7653}
7654
7655
7656/*
7657 * PCMPEQB / VPCMPEQB
7658 */
7659#ifdef IEM_WITHOUT_ASSEMBLY
7660
7661IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7662{
7663 RT_NOREF(pFpuState);
7664 RTUINT64U uSrc1 = { *puDst };
7665 RTUINT64U uSrc2 = { *puSrc };
7666 RTUINT64U uDst;
7667 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7668 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7669 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7670 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7671 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7672 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7673 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7674 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7675 *puDst = uDst.u;
7676}
7677
7678
7679IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7680{
7681 RT_NOREF(pFpuState);
7682 RTUINT128U uSrc1 = *puDst;
7683 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7684 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7685 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7686 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7687 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7688 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7689 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7690 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7691 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7692 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7693 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7694 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7695 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7696 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7697 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7698 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7699}
7700
7701#endif
7702
7703IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7704 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7705{
7706 RT_NOREF(pExtState);
7707 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7708 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7709 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7710 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7711 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7712 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7713 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7714 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7715 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7716 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7717 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7718 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7719 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7720 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7721 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7722 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7723}
7724
7725IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7726 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7727{
7728 RT_NOREF(pExtState);
7729 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7730 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7731 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7732 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7733 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7734 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7735 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7736 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7737 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7738 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7739 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7740 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7741 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7742 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7743 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7744 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7745 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7746 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7747 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7748 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7749 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7750 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7751 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7752 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7753 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7754 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7755 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7756 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7757 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7758 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7759 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7760 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7761}
7762
7763
7764/*
7765 * PCMPEQW / VPCMPEQW
7766 */
7767#ifdef IEM_WITHOUT_ASSEMBLY
7768
7769IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7770{
7771 RT_NOREF(pFpuState);
7772 RTUINT64U uSrc1 = { *puDst };
7773 RTUINT64U uSrc2 = { *puSrc };
7774 RTUINT64U uDst;
7775 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7776 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7777 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7778 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7779 *puDst = uDst.u;
7780}
7781
7782
7783IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7784{
7785 RT_NOREF(pFpuState);
7786 RTUINT128U uSrc1 = *puDst;
7787 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7788 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7789 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7790 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7791 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7792 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7793 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7794 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7795}
7796
7797#endif
7798
7799IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7800 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7801{
7802 RT_NOREF(pExtState);
7803 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7804 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7805 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7806 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7807 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7808 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7809 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7810 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7811}
7812
7813IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7814 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7815{
7816 RT_NOREF(pExtState);
7817 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7818 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7819 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7820 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7821 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7822 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7823 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7824 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7825 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
7826 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
7827 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
7828 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
7829 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
7830 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
7831 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
7832 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
7833}
7834
7835
7836/*
7837 * PCMPEQD / VPCMPEQD.
7838 */
7839#ifdef IEM_WITHOUT_ASSEMBLY
7840
7841IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7842{
7843 RT_NOREF(pFpuState);
7844 RTUINT64U uSrc1 = { *puDst };
7845 RTUINT64U uSrc2 = { *puSrc };
7846 RTUINT64U uDst;
7847 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
7848 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
7849 *puDst = uDst.u;
7850}
7851
7852
7853IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7854{
7855 RT_NOREF(pFpuState);
7856 RTUINT128U uSrc1 = *puDst;
7857 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
7858 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
7859 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
7860 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
7861}
7862
7863#endif /* IEM_WITHOUT_ASSEMBLY */
7864
7865IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7866 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7867{
7868 RT_NOREF(pExtState);
7869 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7870 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7871 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7872 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7873}
7874
7875IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7876 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7877{
7878 RT_NOREF(pExtState);
7879 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7880 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7881 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7882 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7883 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
7884 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
7885 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
7886 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
7887}
7888
7889
7890/*
7891 * PCMPEQQ / VPCMPEQQ.
7892 */
7893IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7894{
7895 RT_NOREF(pFpuState);
7896 RTUINT128U uSrc1 = *puDst;
7897 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
7898 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
7899}
7900
7901IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7902 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7903{
7904 RT_NOREF(pExtState);
7905 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7906 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7907}
7908
7909IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7910 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7911{
7912 RT_NOREF(pExtState);
7913 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7914 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7915 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
7916 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
7917}
7918
7919
7920/*
7921 * PCMPGTB / VPCMPGTB
7922 */
7923#ifdef IEM_WITHOUT_ASSEMBLY
7924
7925IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7926{
7927 RT_NOREF(pFpuState);
7928 RTUINT64U uSrc1 = { *puDst };
7929 RTUINT64U uSrc2 = { *puSrc };
7930 RTUINT64U uDst;
7931 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
7932 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
7933 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
7934 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
7935 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
7936 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
7937 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
7938 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
7939 *puDst = uDst.u;
7940}
7941
7942
7943IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7944{
7945 RT_NOREF(pFpuState);
7946 RTUINT128U uSrc1 = *puDst;
7947 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
7948 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
7949 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
7950 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
7951 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
7952 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
7953 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
7954 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
7955 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
7956 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
7957 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
7958 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
7959 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
7960 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
7961 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
7962 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
7963}
7964
7965#endif
7966
7967IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7968 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7969{
7970 RT_NOREF(pExtState);
7971 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7972 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7973 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7974 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7975 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7976 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7977 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7978 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7979 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7980 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7981 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7982 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7983 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7984 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7985 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7986 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7987}
7988
7989IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7990 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7991{
7992 RT_NOREF(pExtState);
7993 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7994 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7995 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7996 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7997 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7998 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7999 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8000 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8001 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8002 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8003 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8004 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8005 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8006 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8007 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8008 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8009 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8010 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8011 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8012 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8013 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8014 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8015 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8016 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8017 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8018 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8019 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8020 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8021 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8022 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8023 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8024 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8025}
8026
8027
8028/*
8029 * PCMPGTW / VPCMPGTW
8030 */
8031#ifdef IEM_WITHOUT_ASSEMBLY
8032
8033IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8034{
8035 RT_NOREF(pFpuState);
8036 RTUINT64U uSrc1 = { *puDst };
8037 RTUINT64U uSrc2 = { *puSrc };
8038 RTUINT64U uDst;
8039 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8040 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8041 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8042 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8043 *puDst = uDst.u;
8044}
8045
8046
8047IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8048{
8049 RT_NOREF(pFpuState);
8050 RTUINT128U uSrc1 = *puDst;
8051 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8052 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8053 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8054 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8055 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8056 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8057 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8058 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8059}
8060
8061#endif
8062
8063IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8064 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8065{
8066 RT_NOREF(pExtState);
8067 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8068 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8069 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8070 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8071 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8072 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8073 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8074 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8075}
8076
8077IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8078 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8079{
8080 RT_NOREF(pExtState);
8081 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8082 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8083 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8084 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8085 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8086 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8087 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8088 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8089 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8090 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8091 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8092 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8093 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8094 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8095 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8096 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8097}
8098
8099
8100/*
8101 * PCMPGTD / VPCMPGTD.
8102 */
8103#ifdef IEM_WITHOUT_ASSEMBLY
8104
8105IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8106{
8107 RT_NOREF(pFpuState);
8108 RTUINT64U uSrc1 = { *puDst };
8109 RTUINT64U uSrc2 = { *puSrc };
8110 RTUINT64U uDst;
8111 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8112 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8113 *puDst = uDst.u;
8114}
8115
8116
8117IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8118{
8119 RT_NOREF(pFpuState);
8120 RTUINT128U uSrc1 = *puDst;
8121 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8122 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8123 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8124 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8125}
8126
8127#endif /* IEM_WITHOUT_ASSEMBLY */
8128
8129IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8130 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8131{
8132 RT_NOREF(pExtState);
8133 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8134 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8135 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8136 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8137}
8138
8139IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8140 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8141{
8142 RT_NOREF(pExtState);
8143 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8144 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8145 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8146 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8147 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8148 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8149 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8150 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8151}
8152
8153
8154/*
8155 * PCMPGTQ / VPCMPGTQ.
8156 */
8157IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8158{
8159 RT_NOREF(pFpuState);
8160 RTUINT128U uSrc1 = *puDst;
8161 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8162 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8163}
8164
8165IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8166 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8167{
8168 RT_NOREF(pExtState);
8169 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8170 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8171}
8172
8173IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8174 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8175{
8176 RT_NOREF(pExtState);
8177 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8178 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8179 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8180 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8181}
8182
8183
8184/*
8185 * PADDB / VPADDB
8186 */
8187#ifdef IEM_WITHOUT_ASSEMBLY
8188
8189IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8190{
8191 RT_NOREF(pFpuState);
8192 RTUINT64U uSrc1 = { *puDst };
8193 RTUINT64U uSrc2 = { *puSrc };
8194 RTUINT64U uDst;
8195 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8196 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8197 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8198 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8199 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8200 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8201 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8202 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8203 *puDst = uDst.u;
8204}
8205
8206
8207IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8208{
8209 RT_NOREF(pFpuState);
8210 RTUINT128U uSrc1 = *puDst;
8211 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8212 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8213 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8214 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8215 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8216 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8217 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8218 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8219 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8220 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8221 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8222 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8223 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8224 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8225 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8226 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8227}
8228
8229#endif
8230
8231
8232IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8233 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8234{
8235 RT_NOREF(pExtState);
8236 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8237 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8238 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8239 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8240 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8241 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8242 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8243 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8244 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8245 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8246 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8247 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8248 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8249 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8250 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8251 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8252}
8253
8254IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8255 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8256{
8257 RT_NOREF(pExtState);
8258 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8259 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8260 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8261 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8262 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8263 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8264 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8265 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8266 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8267 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8268 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8269 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8270 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8271 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8272 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8273 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8274 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8275 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8276 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8277 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8278 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8279 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8280 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8281 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8282 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8283 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8284 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8285 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8286 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8287 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8288 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8289 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8290}
8291
8292
8293/*
8294 * PADDSB / VPADDSB
8295 */
/*
 * Saturates a signed value that fits in 16 bits to the signed byte range and
 * yields the resulting byte pattern.  Values outside the range become 0x7f
 * (INT8_MAX) or, when source bit 15 (the sign) is set, 0x80 (INT8_MIN).
 * The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
8300
8301#ifdef IEM_WITHOUT_ASSEMBLY
8302
8303IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8304{
8305 RT_NOREF(pFpuState);
8306 RTUINT64U uSrc1 = { *puDst };
8307 RTUINT64U uSrc2 = { *puSrc };
8308 RTUINT64U uDst;
8309 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8310 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8311 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8312 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8313 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8314 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8315 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8316 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8317 *puDst = uDst.u;
8318}
8319
8320
8321IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8322{
8323 RT_NOREF(pFpuState);
8324 RTUINT128U uSrc1 = *puDst;
8325 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8326 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8327 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8328 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8329 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8330 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8331 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8332 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8333 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8334 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8335 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8336 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8337 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8338 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8339 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8340 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8341}
8342
8343#endif
8344
8345
/*
 * PADDUSB / VPADDUSB
 */
/*
 * Clamps an unsigned value to at most 0xff (UINT8_MAX).  The range check is
 * done on the argument truncated to 16 bits.  The argument is evaluated more
 * than once.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8353
8354#ifdef IEM_WITHOUT_ASSEMBLY
8355
8356IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8357{
8358 RT_NOREF(pFpuState);
8359 RTUINT64U uSrc1 = { *puDst };
8360 RTUINT64U uSrc2 = { *puSrc };
8361 RTUINT64U uDst;
8362 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8363 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8364 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8365 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8366 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8367 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8368 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8369 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8370 *puDst = uDst.u;
8371}
8372
8373
8374IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8375{
8376 RT_NOREF(pFpuState);
8377 RTUINT128U uSrc1 = *puDst;
8378 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8379 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8380 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8381 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8382 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8383 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8384 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8385 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8386 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8387 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8388 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8389 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8390 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8391 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8392 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8393 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8394}
8395
8396#endif
8397
8398
8399/*
8400 * PADDW / VPADDW
8401 */
8402#ifdef IEM_WITHOUT_ASSEMBLY
8403
8404IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8405{
8406 RT_NOREF(pFpuState);
8407 RTUINT64U uSrc1 = { *puDst };
8408 RTUINT64U uSrc2 = { *puSrc };
8409 RTUINT64U uDst;
8410 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8411 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8412 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8413 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8414 *puDst = uDst.u;
8415}
8416
8417
8418IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8419{
8420 RT_NOREF(pFpuState);
8421 RTUINT128U uSrc1 = *puDst;
8422 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8423 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8424 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8425 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8426 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8427 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8428 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8429 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8430}
8431
8432#endif
8433
8434
8435IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8436 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8437{
8438 RT_NOREF(pExtState);
8439 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8440 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8441 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8442 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8443 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8444 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8445 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8446 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8447}
8448
8449IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8450 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8451{
8452 RT_NOREF(pExtState);
8453 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8454 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8455 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8456 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8457 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8458 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8459 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8460 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8461 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8462 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8463 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8464 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8465 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8466 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8467 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8468 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8469}
8470
8471
8472/*
8473 * PADDSW / VPADDSW
8474 */
/*
 * Saturates a signed 32-bit value to the signed word range and yields the
 * resulting 16-bit pattern.  Values outside the range become 0x7fff
 * (INT16_MAX) or, when source bit 31 (the sign) is set, 0x8000 (INT16_MIN).
 * The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8479
8480#ifdef IEM_WITHOUT_ASSEMBLY
8481
8482IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8483{
8484 RT_NOREF(pFpuState);
8485 RTUINT64U uSrc1 = { *puDst };
8486 RTUINT64U uSrc2 = { *puSrc };
8487 RTUINT64U uDst;
8488 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8489 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8490 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8491 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8492 *puDst = uDst.u;
8493}
8494
8495
8496IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8497{
8498 RT_NOREF(pFpuState);
8499 RTUINT128U uSrc1 = *puDst;
8500 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8501 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8502 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8503 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8504 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8505 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8506 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8507 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8508}
8509
8510#endif
8511
8512
8513/*
8514 * PADDUSW / VPADDUSW
8515 */
/*
 * Clamps an unsigned value to at most 0xffff (UINT16_MAX).  The range check
 * is done on the argument converted to 32 bits.  The argument is evaluated
 * more than once.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
8520
8521#ifdef IEM_WITHOUT_ASSEMBLY
8522
8523IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8524{
8525 RT_NOREF(pFpuState);
8526 RTUINT64U uSrc1 = { *puDst };
8527 RTUINT64U uSrc2 = { *puSrc };
8528 RTUINT64U uDst;
8529 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8530 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8531 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8532 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8533 *puDst = uDst.u;
8534}
8535
8536
8537IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8538{
8539 RT_NOREF(pFpuState);
8540 RTUINT128U uSrc1 = *puDst;
8541 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8542 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8543 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8544 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8545 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8546 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8547 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8548 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8549}
8550
8551#endif
8552
8553
8554/*
8555 * PADDD / VPADDD.
8556 */
8557#ifdef IEM_WITHOUT_ASSEMBLY
8558
8559IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8560{
8561 RT_NOREF(pFpuState);
8562 RTUINT64U uSrc1 = { *puDst };
8563 RTUINT64U uSrc2 = { *puSrc };
8564 RTUINT64U uDst;
8565 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8566 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8567 *puDst = uDst.u;
8568}
8569
8570
8571IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8572{
8573 RT_NOREF(pFpuState);
8574 RTUINT128U uSrc1 = *puDst;
8575 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8576 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8577 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8578 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8579}
8580
8581#endif /* IEM_WITHOUT_ASSEMBLY */
8582
8583IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8584 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8585{
8586 RT_NOREF(pExtState);
8587 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8588 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8589 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8590 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8591}
8592
8593IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8594 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8595{
8596 RT_NOREF(pExtState);
8597 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8598 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8599 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8600 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8601 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8602 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8603 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8604 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8605}
8606
8607
8608/*
8609 * PADDQ / VPADDQ.
8610 */
8611#ifdef IEM_WITHOUT_ASSEMBLY
8612
8613IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8614{
8615 RT_NOREF(pFpuState);
8616 *puDst = *puDst + *puSrc;
8617}
8618
8619IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8620{
8621 RT_NOREF(pFpuState);
8622 RTUINT128U uSrc1 = *puDst;
8623 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8624 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8625}
8626
8627#endif
8628
8629IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8630 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8631{
8632 RT_NOREF(pExtState);
8633 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8634 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8635}
8636
8637IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8638 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8639{
8640 RT_NOREF(pExtState);
8641 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8642 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8643 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8644 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8645}
8646
8647
8648/*
8649 * PSUBB / VPSUBB
8650 */
8651#ifdef IEM_WITHOUT_ASSEMBLY
8652
8653IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8654{
8655 RT_NOREF(pFpuState);
8656 RTUINT64U uSrc1 = { *puDst };
8657 RTUINT64U uSrc2 = { *puSrc };
8658 RTUINT64U uDst;
8659 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8660 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8661 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8662 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8663 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8664 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8665 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8666 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8667 *puDst = uDst.u;
8668}
8669
8670
8671IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8672{
8673 RT_NOREF(pFpuState);
8674 RTUINT128U uSrc1 = *puDst;
8675 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8676 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8677 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8678 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8679 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8680 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8681 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8682 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8683 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8684 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8685 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8686 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8687 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8688 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8689 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8690 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8691}
8692
8693#endif
8694
8695IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8696 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8697{
8698 RT_NOREF(pExtState);
8699 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8700 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8701 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8702 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8703 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8704 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8705 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8706 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8707 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8708 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8709 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8710 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8711 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8712 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8713 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8714 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8715}
8716
8717IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8718 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8719{
8720 RT_NOREF(pExtState);
8721 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8722 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8723 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8724 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8725 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8726 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8727 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8728 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8729 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8730 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8731 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8732 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8733 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8734 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8735 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8736 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8737 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8738 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8739 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8740 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8741 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8742 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8743 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8744 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8745 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8746 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8747 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8748 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8749 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8750 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8751 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8752 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8753}
8754
8755
/*
 * PSUBSB / VPSUBSB
 */
8759#ifdef IEM_WITHOUT_ASSEMBLY
8760
8761IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8762{
8763 RT_NOREF(pFpuState);
8764 RTUINT64U uSrc1 = { *puDst };
8765 RTUINT64U uSrc2 = { *puSrc };
8766 RTUINT64U uDst;
8767 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8768 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8769 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8770 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8771 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8772 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8773 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8774 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8775 *puDst = uDst.u;
8776}
8777
8778
8779IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8780{
8781 RT_NOREF(pFpuState);
8782 RTUINT128U uSrc1 = *puDst;
8783 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8784 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8785 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8786 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8787 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8788 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8789 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8790 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8791 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8792 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8793 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8794 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8795 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8796 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8797 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8798 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8799}
8800
8801#endif
8802
8803
/*
 * PSUBUSB / VPSUBUSB
 */
/** Clamps the result of an unsigned byte subtraction: the value is viewed as
 *  uint16_t, so a negative difference wraps above 0xff and yields 0; values in
 *  the range 0..0xff are passed through unchanged.
 *  @note The argument is evaluated twice; pass side-effect free expressions. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    (   (uint16_t)(a_uWord) > 0xffU \
     ? (uint8_t)0 \
     : (uint8_t)(a_uWord) )
8811
8812#ifdef IEM_WITHOUT_ASSEMBLY
8813
8814IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8815{
8816 RT_NOREF(pFpuState);
8817 RTUINT64U uSrc1 = { *puDst };
8818 RTUINT64U uSrc2 = { *puSrc };
8819 RTUINT64U uDst;
8820 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
8821 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
8822 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
8823 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
8824 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
8825 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
8826 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
8827 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
8828 *puDst = uDst.u;
8829}
8830
8831
8832IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8833{
8834 RT_NOREF(pFpuState);
8835 RTUINT128U uSrc1 = *puDst;
8836 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
8837 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
8838 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
8839 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
8840 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
8841 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
8842 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
8843 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
8844 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
8845 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
8846 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
8847 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
8848 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
8849 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
8850 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
8851 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
8852}
8853
8854#endif
8855
8856
8857/*
8858 * PSUBW / VPSUBW
8859 */
8860#ifdef IEM_WITHOUT_ASSEMBLY
8861
8862IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8863{
8864 RT_NOREF(pFpuState);
8865 RTUINT64U uSrc1 = { *puDst };
8866 RTUINT64U uSrc2 = { *puSrc };
8867 RTUINT64U uDst;
8868 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
8869 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
8870 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
8871 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
8872 *puDst = uDst.u;
8873}
8874
8875
8876IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8877{
8878 RT_NOREF(pFpuState);
8879 RTUINT128U uSrc1 = *puDst;
8880 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
8881 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
8882 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
8883 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
8884 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
8885 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
8886 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
8887 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
8888}
8889
8890#endif
8891
8892IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8893 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8894{
8895 RT_NOREF(pExtState);
8896 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8897 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8898 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8899 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8900 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8901 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8902 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8903 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8904}
8905
8906IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8907 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8908{
8909 RT_NOREF(pExtState);
8910 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8911 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8912 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8913 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8914 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8915 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8916 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8917 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8918 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
8919 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
8920 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
8921 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
8922 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
8923 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
8924 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
8925 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
8926}
8927
8928
8929/*
8930 * PSUBSW / VPSUBSW
8931 */
8932#ifdef IEM_WITHOUT_ASSEMBLY
8933
8934IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8935{
8936 RT_NOREF(pFpuState);
8937 RTUINT64U uSrc1 = { *puDst };
8938 RTUINT64U uSrc2 = { *puSrc };
8939 RTUINT64U uDst;
8940 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
8941 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
8942 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
8943 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
8944 *puDst = uDst.u;
8945}
8946
8947
8948IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8949{
8950 RT_NOREF(pFpuState);
8951 RTUINT128U uSrc1 = *puDst;
8952 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
8953 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
8954 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
8955 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
8956 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
8957 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
8958 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
8959 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
8960}
8961
8962#endif
8963
8964
8965/*
8966 * PSUBUSW / VPSUBUSW
8967 */
/** Clamps the result of an unsigned word subtraction: the value is viewed as
 *  uint32_t, so a negative difference wraps above 0xffff and yields 0; values
 *  in the range 0..0xffff are passed through unchanged.
 *  @note The argument is evaluated twice; pass side-effect free expressions. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    (   (uint32_t)(a_uDword) > 0xffffU \
     ? (uint16_t)0 \
     : (uint16_t)(a_uDword) )
8972
8973#ifdef IEM_WITHOUT_ASSEMBLY
8974
8975IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8976{
8977 RT_NOREF(pFpuState);
8978 RTUINT64U uSrc1 = { *puDst };
8979 RTUINT64U uSrc2 = { *puSrc };
8980 RTUINT64U uDst;
8981 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
8982 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
8983 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
8984 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
8985 *puDst = uDst.u;
8986}
8987
8988
8989IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8990{
8991 RT_NOREF(pFpuState);
8992 RTUINT128U uSrc1 = *puDst;
8993 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
8994 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
8995 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
8996 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
8997 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
8998 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
8999 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9000 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9001}
9002
9003#endif
9004
9005
9006/*
9007 * PSUBD / VPSUBD.
9008 */
9009#ifdef IEM_WITHOUT_ASSEMBLY
9010
9011IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9012{
9013 RT_NOREF(pFpuState);
9014 RTUINT64U uSrc1 = { *puDst };
9015 RTUINT64U uSrc2 = { *puSrc };
9016 RTUINT64U uDst;
9017 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9018 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9019 *puDst = uDst.u;
9020}
9021
9022
9023IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9024{
9025 RT_NOREF(pFpuState);
9026 RTUINT128U uSrc1 = *puDst;
9027 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9028 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9029 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9030 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9031}
9032
9033#endif /* IEM_WITHOUT_ASSEMBLY */
9034
9035IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9036 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9037{
9038 RT_NOREF(pExtState);
9039 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9040 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9041 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9042 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9043}
9044
9045IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9046 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9047{
9048 RT_NOREF(pExtState);
9049 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9050 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9051 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9052 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9053 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9054 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9055 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9056 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9057}
9058
9059
9060/*
9061 * PSUBQ / VPSUBQ.
9062 */
9063#ifdef IEM_WITHOUT_ASSEMBLY
9064
9065IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9066{
9067 RT_NOREF(pFpuState);
9068 *puDst = *puDst - *puSrc;
9069}
9070
9071IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9072{
9073 RT_NOREF(pFpuState);
9074 RTUINT128U uSrc1 = *puDst;
9075 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9076 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9077}
9078
9079#endif
9080
9081IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9082 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9083{
9084 RT_NOREF(pExtState);
9085 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9086 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9087}
9088
9089IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9090 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9091{
9092 RT_NOREF(pExtState);
9093 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9094 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9095 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9096 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9097}
9098
9099
9100
9101/*
9102 * PMULLW / VPMULLW / PMULLD / VPMULLD
9103 */
9104#ifdef IEM_WITHOUT_ASSEMBLY
9105
9106IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9107{
9108 RT_NOREF(pFpuState);
9109 RTUINT64U uSrc1 = { *puDst };
9110 RTUINT64U uSrc2 = { *puSrc };
9111 RTUINT64U uDst;
9112 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9113 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9114 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9115 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9116 *puDst = uDst.u;
9117}
9118
9119
9120IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9121{
9122 RT_NOREF(pFpuState);
9123 RTUINT128U uSrc1 = *puDst;
9124 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9125 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9126 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9127 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9128 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9129 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9130 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9131 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9132}
9133
9134#endif
9135
9136IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9137{
9138 RTUINT128U uSrc1 = *puDst;
9139
9140 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9141 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9142 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9143 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9144 RT_NOREF(pFpuState);
9145}
9146
9147
9148IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9149{
9150 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9151 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9152 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9153 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9154 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9155 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9156 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9157 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9158}
9159
9160
9161IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9162{
9163 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9164 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9165 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9166 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9167 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9168 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9169 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9170 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9171 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9172 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9173 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9174 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9175 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9176 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9177 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9178 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9179}
9180
9181
9182IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9183{
9184 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9185 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9186 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9187 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9188}
9189
9190
9191IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9192{
9193 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9194 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9195 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9196 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9197 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9198 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9199 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9200 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9201}
9202
9203
9204/*
9205 * PMULHW / VPMULHW
9206 */
9207#ifdef IEM_WITHOUT_ASSEMBLY
9208
9209IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9210{
9211 RT_NOREF(pFpuState);
9212 RTUINT64U uSrc1 = { *puDst };
9213 RTUINT64U uSrc2 = { *puSrc };
9214 RTUINT64U uDst;
9215 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9216 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9217 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9218 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9219 *puDst = uDst.u;
9220}
9221
9222
9223IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9224{
9225 RT_NOREF(pFpuState);
9226 RTUINT128U uSrc1 = *puDst;
9227 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9228 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9229 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9230 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9231 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9232 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9233 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9234 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9235}
9236
9237#endif
9238
9239IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9240{
9241 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9242 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9243 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9244 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9245 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9246 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9247 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9248 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9249}
9250
9251
9252IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9253{
9254 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9255 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9256 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9257 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9258 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9259 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9260 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9261 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9262 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9263 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9264 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9265 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9266 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9267 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9268 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9269 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9270}
9271
9272
9273/*
9274 * PMULHUW / VPMULHUW
9275 */
9276#ifdef IEM_WITHOUT_ASSEMBLY
9277
9278IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9279{
9280 RTUINT64U uSrc1 = { *puDst };
9281 RTUINT64U uSrc2 = { *puSrc };
9282 RTUINT64U uDst;
9283 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9284 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9285 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9286 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9287 *puDst = uDst.u;
9288}
9289
9290
9291IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9292{
9293 RTUINT128U uSrc1 = *puDst;
9294 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9295 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9296 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9297 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9298 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9299 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9300 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9301 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9302}
9303
9304#endif
9305
9306IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9307{
9308 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9309 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9310 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9311 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9312 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9313 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9314 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9315 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9316}
9317
9318
9319IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9320{
9321 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9322 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9323 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9324 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9325 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9326 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9327 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9328 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9329 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9330 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9331 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9332 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9333 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9334 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9335 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9336 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9337}
9338
9339
9340/*
9341 * PSRLW / VPSRLW
9342 */
9343#ifdef IEM_WITHOUT_ASSEMBLY
9344
9345IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9346{
9347 RTUINT64U uSrc1 = { *puDst };
9348 RTUINT64U uSrc2 = { *puSrc };
9349 RTUINT64U uDst;
9350
9351 if (uSrc2.au64[0] <= 15)
9352 {
9353 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9354 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9355 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9356 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9357 }
9358 else
9359 {
9360 uDst.au64[0] = 0;
9361 }
9362 *puDst = uDst.u;
9363}
9364
9365
9366IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9367{
9368 RTUINT64U uSrc1 = { *puDst };
9369 RTUINT64U uDst;
9370
9371 if (uShift <= 15)
9372 {
9373 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9374 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9375 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9376 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9377 }
9378 else
9379 {
9380 uDst.au64[0] = 0;
9381 }
9382 *puDst = uDst.u;
9383}
9384
9385
9386IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9387{
9388 RTUINT128U uSrc1 = *puDst;
9389
9390 if (puSrc->au64[0] <= 15)
9391 {
9392 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9393 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9394 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9395 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9396 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9397 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9398 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9399 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9400 }
9401 else
9402 {
9403 puDst->au64[0] = 0;
9404 puDst->au64[1] = 0;
9405 }
9406}
9407
9408IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9409{
9410 RTUINT128U uSrc1 = *puDst;
9411
9412 if (uShift <= 15)
9413 {
9414 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9415 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9416 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9417 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9418 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9419 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9420 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9421 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9422 }
9423 else
9424 {
9425 puDst->au64[0] = 0;
9426 puDst->au64[1] = 0;
9427 }
9428}
9429
9430#endif
9431
9432
9433/*
9434 * PSRAW / VPSRAW
9435 */
9436#ifdef IEM_WITHOUT_ASSEMBLY
9437
9438IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9439{
9440 RTUINT64U uSrc1 = { *puDst };
9441 RTUINT64U uSrc2 = { *puSrc };
9442 RTUINT64U uDst;
9443
9444 if (uSrc2.au64[0] <= 15)
9445 {
9446 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
9447 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
9448 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
9449 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
9450 }
9451 else
9452 {
9453 uDst.au64[0] = 0;
9454 }
9455 *puDst = uDst.u;
9456}
9457
9458
9459IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9460{
9461 RTUINT64U uSrc1 = { *puDst };
9462 RTUINT64U uDst;
9463
9464 if (uShift <= 15)
9465 {
9466 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
9467 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
9468 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
9469 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
9470 }
9471 else
9472 {
9473 uDst.au64[0] = 0;
9474 }
9475 *puDst = uDst.u;
9476}
9477
9478
9479IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9480{
9481 RTUINT128U uSrc1 = *puDst;
9482
9483 if (puSrc->au64[0] <= 15)
9484 {
9485 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
9486 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
9487 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
9488 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
9489 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
9490 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
9491 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
9492 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
9493 }
9494 else
9495 {
9496 puDst->au64[0] = 0;
9497 puDst->au64[1] = 0;
9498 }
9499}
9500
9501IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9502{
9503 RTUINT128U uSrc1 = *puDst;
9504
9505 if (uShift <= 15)
9506 {
9507 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
9508 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
9509 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
9510 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
9511 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
9512 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
9513 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
9514 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
9515 }
9516 else
9517 {
9518 puDst->au64[0] = 0;
9519 puDst->au64[1] = 0;
9520 }
9521}
9522
9523#endif
9524
9525
9526/*
9527 * PSLLW / VPSLLW
9528 */
9529#ifdef IEM_WITHOUT_ASSEMBLY
9530
9531IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9532{
9533 RTUINT64U uSrc1 = { *puDst };
9534 RTUINT64U uSrc2 = { *puSrc };
9535 RTUINT64U uDst;
9536
9537 if (uSrc2.au64[0] <= 15)
9538 {
9539 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
9540 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
9541 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
9542 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
9543 }
9544 else
9545 {
9546 uDst.au64[0] = 0;
9547 }
9548 *puDst = uDst.u;
9549}
9550
9551
9552IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9553{
9554 RTUINT64U uSrc1 = { *puDst };
9555 RTUINT64U uDst;
9556
9557 if (uShift <= 15)
9558 {
9559 uDst.au16[0] = uSrc1.au16[0] << uShift;
9560 uDst.au16[1] = uSrc1.au16[1] << uShift;
9561 uDst.au16[2] = uSrc1.au16[2] << uShift;
9562 uDst.au16[3] = uSrc1.au16[3] << uShift;
9563 }
9564 else
9565 {
9566 uDst.au64[0] = 0;
9567 }
9568 *puDst = uDst.u;
9569}
9570
9571
9572IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9573{
9574 RTUINT128U uSrc1 = *puDst;
9575
9576 if (puSrc->au64[0] <= 15)
9577 {
9578 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
9579 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
9580 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
9581 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
9582 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
9583 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
9584 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
9585 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
9586 }
9587 else
9588 {
9589 puDst->au64[0] = 0;
9590 puDst->au64[1] = 0;
9591 }
9592}
9593
9594IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9595{
9596 RTUINT128U uSrc1 = *puDst;
9597
9598 if (uShift <= 15)
9599 {
9600 puDst->au16[0] = uSrc1.au16[0] << uShift;
9601 puDst->au16[1] = uSrc1.au16[1] << uShift;
9602 puDst->au16[2] = uSrc1.au16[2] << uShift;
9603 puDst->au16[3] = uSrc1.au16[3] << uShift;
9604 puDst->au16[4] = uSrc1.au16[4] << uShift;
9605 puDst->au16[5] = uSrc1.au16[5] << uShift;
9606 puDst->au16[6] = uSrc1.au16[6] << uShift;
9607 puDst->au16[7] = uSrc1.au16[7] << uShift;
9608 }
9609 else
9610 {
9611 puDst->au64[0] = 0;
9612 puDst->au64[1] = 0;
9613 }
9614}
9615
9616#endif
9617
9618
9619/*
9620 * PSRLD / VPSRLD
9621 */
9622#ifdef IEM_WITHOUT_ASSEMBLY
9623
9624IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9625{
9626 RTUINT64U uSrc1 = { *puDst };
9627 RTUINT64U uSrc2 = { *puSrc };
9628 RTUINT64U uDst;
9629
9630 if (uSrc2.au64[0] <= 31)
9631 {
9632 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9633 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9634 }
9635 else
9636 {
9637 uDst.au64[0] = 0;
9638 }
9639 *puDst = uDst.u;
9640}
9641
9642
9643IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9644{
9645 RTUINT64U uSrc1 = { *puDst };
9646 RTUINT64U uDst;
9647
9648 if (uShift <= 31)
9649 {
9650 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9651 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9652 }
9653 else
9654 {
9655 uDst.au64[0] = 0;
9656 }
9657 *puDst = uDst.u;
9658}
9659
9660
9661IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9662{
9663 RTUINT128U uSrc1 = *puDst;
9664
9665 if (puSrc->au64[0] <= 31)
9666 {
9667 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9668 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9669 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9670 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9671 }
9672 else
9673 {
9674 puDst->au64[0] = 0;
9675 puDst->au64[1] = 0;
9676 }
9677}
9678
9679IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9680{
9681 RTUINT128U uSrc1 = *puDst;
9682
9683 if (uShift <= 31)
9684 {
9685 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9686 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9687 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9688 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9689 }
9690 else
9691 {
9692 puDst->au64[0] = 0;
9693 puDst->au64[1] = 0;
9694 }
9695}
9696
9697#endif
9698
9699
9700/*
9701 * PSRAD / VPSRAD
9702 */
9703#ifdef IEM_WITHOUT_ASSEMBLY
9704
9705IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9706{
9707 RTUINT64U uSrc1 = { *puDst };
9708 RTUINT64U uSrc2 = { *puSrc };
9709 RTUINT64U uDst;
9710
9711 if (uSrc2.au64[0] <= 31)
9712 {
9713 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9714 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9715 }
9716 else
9717 {
9718 uDst.au64[0] = 0;
9719 }
9720 *puDst = uDst.u;
9721}
9722
9723
9724IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
9725{
9726 RTUINT64U uSrc1 = { *puDst };
9727 RTUINT64U uDst;
9728
9729 if (uShift <= 31)
9730 {
9731 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
9732 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
9733 }
9734 else
9735 {
9736 uDst.au64[0] = 0;
9737 }
9738 *puDst = uDst.u;
9739}
9740
9741
9742IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9743{
9744 RTUINT128U uSrc1 = *puDst;
9745
9746 if (puSrc->au64[0] <= 31)
9747 {
9748 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
9749 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
9750 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
9751 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
9752 }
9753 else
9754 {
9755 puDst->au64[0] = 0;
9756 puDst->au64[1] = 0;
9757 }
9758}
9759
9760IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9761{
9762 RTUINT128U uSrc1 = *puDst;
9763
9764 if (uShift <= 31)
9765 {
9766 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
9767 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
9768 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
9769 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
9770 }
9771 else
9772 {
9773 puDst->au64[0] = 0;
9774 puDst->au64[1] = 0;
9775 }
9776}
9777
9778#endif
9779
9780
9781/*
9782 * PSLLD / VPSLLD
9783 */
9784#ifdef IEM_WITHOUT_ASSEMBLY
9785
9786IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9787{
9788 RTUINT64U uSrc1 = { *puDst };
9789 RTUINT64U uSrc2 = { *puSrc };
9790 RTUINT64U uDst;
9791
9792 if (uSrc2.au64[0] <= 31)
9793 {
9794 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
9795 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
9796 }
9797 else
9798 {
9799 uDst.au64[0] = 0;
9800 }
9801 *puDst = uDst.u;
9802}
9803
9804
9805IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9806{
9807 RTUINT64U uSrc1 = { *puDst };
9808 RTUINT64U uDst;
9809
9810 if (uShift <= 31)
9811 {
9812 uDst.au32[0] = uSrc1.au32[0] << uShift;
9813 uDst.au32[1] = uSrc1.au32[1] << uShift;
9814 }
9815 else
9816 {
9817 uDst.au64[0] = 0;
9818 }
9819 *puDst = uDst.u;
9820}
9821
9822
9823IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9824{
9825 RTUINT128U uSrc1 = *puDst;
9826
9827 if (puSrc->au64[0] <= 31)
9828 {
9829 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
9830 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
9831 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
9832 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
9833 }
9834 else
9835 {
9836 puDst->au64[0] = 0;
9837 puDst->au64[1] = 0;
9838 }
9839}
9840
9841IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9842{
9843 RTUINT128U uSrc1 = *puDst;
9844
9845 if (uShift <= 31)
9846 {
9847 puDst->au32[0] = uSrc1.au32[0] << uShift;
9848 puDst->au32[1] = uSrc1.au32[1] << uShift;
9849 puDst->au32[2] = uSrc1.au32[2] << uShift;
9850 puDst->au32[3] = uSrc1.au32[3] << uShift;
9851 }
9852 else
9853 {
9854 puDst->au64[0] = 0;
9855 puDst->au64[1] = 0;
9856 }
9857}
9858
9859#endif
9860
9861
9862/*
9863 * PSRLQ / VPSRLQ
9864 */
9865#ifdef IEM_WITHOUT_ASSEMBLY
9866
9867IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9868{
9869 RTUINT64U uSrc1 = { *puDst };
9870 RTUINT64U uSrc2 = { *puSrc };
9871 RTUINT64U uDst;
9872
9873 if (uSrc2.au64[0] <= 63)
9874 {
9875 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
9876 }
9877 else
9878 {
9879 uDst.au64[0] = 0;
9880 }
9881 *puDst = uDst.u;
9882}
9883
9884
9885IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9886{
9887 RTUINT64U uSrc1 = { *puDst };
9888 RTUINT64U uDst;
9889
9890 if (uShift <= 63)
9891 {
9892 uDst.au64[0] = uSrc1.au64[0] >> uShift;
9893 }
9894 else
9895 {
9896 uDst.au64[0] = 0;
9897 }
9898 *puDst = uDst.u;
9899}
9900
9901
9902IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9903{
9904 RTUINT128U uSrc1 = *puDst;
9905
9906 if (puSrc->au64[0] <= 63)
9907 {
9908 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
9909 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
9910 }
9911 else
9912 {
9913 puDst->au64[0] = 0;
9914 puDst->au64[1] = 0;
9915 }
9916}
9917
9918IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9919{
9920 RTUINT128U uSrc1 = *puDst;
9921
9922 if (uShift <= 63)
9923 {
9924 puDst->au64[0] = uSrc1.au64[0] >> uShift;
9925 puDst->au64[1] = uSrc1.au64[1] >> uShift;
9926 }
9927 else
9928 {
9929 puDst->au64[0] = 0;
9930 puDst->au64[1] = 0;
9931 }
9932}
9933
9934#endif
9935
9936
9937/*
9938 * PSLLQ / VPSLLQ
9939 */
9940#ifdef IEM_WITHOUT_ASSEMBLY
9941
9942IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9943{
9944 RTUINT64U uSrc1 = { *puDst };
9945 RTUINT64U uSrc2 = { *puSrc };
9946 RTUINT64U uDst;
9947
9948 if (uSrc2.au64[0] <= 63)
9949 {
9950 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
9951 }
9952 else
9953 {
9954 uDst.au64[0] = 0;
9955 }
9956 *puDst = uDst.u;
9957}
9958
9959
9960IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9961{
9962 RTUINT64U uSrc1 = { *puDst };
9963 RTUINT64U uDst;
9964
9965 if (uShift <= 63)
9966 {
9967 uDst.au64[0] = uSrc1.au64[0] << uShift;
9968 }
9969 else
9970 {
9971 uDst.au64[0] = 0;
9972 }
9973 *puDst = uDst.u;
9974}
9975
9976
9977IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9978{
9979 RTUINT128U uSrc1 = *puDst;
9980
9981 if (puSrc->au64[0] <= 63)
9982 {
9983 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
9984 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
9985 }
9986 else
9987 {
9988 puDst->au64[0] = 0;
9989 puDst->au64[1] = 0;
9990 }
9991}
9992
9993IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9994{
9995 RTUINT128U uSrc1 = *puDst;
9996
9997 if (uShift <= 63)
9998 {
9999 puDst->au64[0] = uSrc1.au64[0] << uShift;
10000 puDst->au64[1] = uSrc1.au64[1] << uShift;
10001 }
10002 else
10003 {
10004 puDst->au64[0] = 0;
10005 puDst->au64[1] = 0;
10006 }
10007}
10008
10009#endif
10010
10011
10012/*
10013 * PSRLDQ / VPSRLDQ
10014 */
10015#ifdef IEM_WITHOUT_ASSEMBLY
10016
10017IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10018{
10019 RTUINT128U uSrc1 = *puDst;
10020
10021 if (uShift < 16)
10022 {
10023 int i;
10024
10025 for (i = 0; i < 16 - uShift; ++i)
10026 puDst->au8[i] = uSrc1.au8[i + uShift];
10027 for (i = 16 - uShift; i < 16; ++i)
10028 puDst->au8[i] = 0;
10029 }
10030 else
10031 {
10032 puDst->au64[0] = 0;
10033 puDst->au64[1] = 0;
10034 }
10035}
10036
10037#endif
10038
10039
10040/*
10041 * PSLLDQ / VPSLLDQ
10042 */
10043#ifdef IEM_WITHOUT_ASSEMBLY
10044
10045IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10046{
10047 RTUINT128U uSrc1 = *puDst;
10048
10049 if (uShift < 16)
10050 {
10051 int i;
10052
10053 for (i = 0; i < uShift; ++i)
10054 puDst->au8[i] = 0;
10055 for (i = uShift; i < 16; ++i)
10056 puDst->au8[i] = uSrc1.au8[i - uShift];
10057 }
10058 else
10059 {
10060 puDst->au64[0] = 0;
10061 puDst->au64[1] = 0;
10062 }
10063}
10064
10065#endif
10066
10067
10068/*
10069 * PMADDWD / VPMADDWD
10070 */
10071#ifdef IEM_WITHOUT_ASSEMBLY
10072
10073IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10074{
10075 RTUINT64U uSrc1 = { *puDst };
10076 RTUINT64U uSrc2 = { *puSrc };
10077 RTUINT64U uDst;
10078
10079 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10080 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10081 *puDst = uDst.u;
10082 RT_NOREF(pFpuState);
10083}
10084
10085
10086IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10087{
10088 RTUINT128U uSrc1 = *puDst;
10089
10090 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10091 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10092 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10093 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10094 RT_NOREF(pFpuState);
10095}
10096
10097#endif
10098
10099
10100/*
10101 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10102 */
10103#ifdef IEM_WITHOUT_ASSEMBLY
10104
10105IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10106{
10107 RTUINT64U uSrc1 = { *puDst };
10108 RTUINT64U uSrc2 = { *puSrc };
10109 RTUINT64U uDst;
10110
10111 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
10112 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
10113 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
10114 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
10115 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
10116 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
10117 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
10118 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
10119 *puDst = uDst.u;
10120 RT_NOREF(pFpuState);
10121}
10122
10123
10124IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10125{
10126 RTUINT128U uSrc1 = *puDst;
10127
10128 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
10129 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
10130 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
10131 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
10132 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
10133 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
10134 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
10135 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
10136 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
10137 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
10138 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
10139 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
10140 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
10141 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
10142 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
10143 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
10144 RT_NOREF(pFpuState);
10145}
10146
10147#endif
10148
10149
10150IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10151{
10152 RTUINT128U uSrc1 = *puDst;
10153
10154 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
10155 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
10156 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
10157 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
10158 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
10159 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
10160 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
10161 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
10162 RT_NOREF(pFpuState);
10163}
10164
10165
10166IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10167{
10168 RTUINT128U uSrc1 = *puDst;
10169
10170 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
10171 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
10172 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
10173 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
10174 RT_NOREF(pFpuState);
10175}
10176
10177
10178IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10179 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10180{
10181 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10182 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10183 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10184 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10185 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10186 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10187 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10188 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10189 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10190 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10191 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10192 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10193 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10194 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10195 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10196 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10197 RT_NOREF(pExtState);
10198}
10199
10200
10201IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10202 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10203{
10204 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10205 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10206 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10207 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10208 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10209 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10210 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10211 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10212 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10213 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10214 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10215 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10216 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10217 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10218 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10219 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10220 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
10221 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
10222 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
10223 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
10224 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
10225 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
10226 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
10227 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
10228 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
10229 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
10230 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
10231 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
10232 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
10233 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
10234 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
10235 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
10236 RT_NOREF(pExtState);
10237}
10238
10239
10240IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10241 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10242{
10243 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10244 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10245 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10246 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10247 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10248 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10249 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10250 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10251 RT_NOREF(pExtState);
10252}
10253
10254
10255IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10256 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10257{
10258 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10259 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10260 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10261 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10262 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10263 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10264 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10265 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10266 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10267 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10268 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
10269 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
10270 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
10271 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
10272 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
10273 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
10274 RT_NOREF(pExtState);
10275}
10276
10277
10278IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10279 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10280{
10281 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10282 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10283 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10284 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10285 RT_NOREF(pExtState);
10286}
10287
10288
10289IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10290 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10291{
10292 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10293 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10294 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10295 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10296 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10297 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10298 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10299 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10300 RT_NOREF(pExtState);
10301}
10302
10303
10304/*
10305 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
10306 */
10307#ifdef IEM_WITHOUT_ASSEMBLY
10308
10309IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10310{
10311 RTUINT64U uSrc1 = { *puDst };
10312 RTUINT64U uSrc2 = { *puSrc };
10313 RTUINT64U uDst;
10314
10315 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
10316 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
10317 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
10318 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
10319 *puDst = uDst.u;
10320 RT_NOREF(pFpuState);
10321}
10322
10323
10324IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10325{
10326 RTUINT128U uSrc1 = *puDst;
10327
10328 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10329 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10330 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10331 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10332 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10333 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10334 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10335 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10336 RT_NOREF(pFpuState);
10337}
10338
10339#endif
10340
10341IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10342{
10343 RTUINT128U uSrc1 = *puDst;
10344
10345 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10346 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10347 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10348 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10349 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10350 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10351 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10352 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10353 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10354 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10355 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
10356 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
10357 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
10358 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
10359 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
10360 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
10361 RT_NOREF(pFpuState);
10362}
10363
10364
10365IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10366{
10367 RTUINT128U uSrc1 = *puDst;
10368
10369 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10370 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10371 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10372 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10373 RT_NOREF(pFpuState);
10374}
10375
10376
10377IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10378 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10379{
10380 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10381 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10382 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10383 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10384 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10385 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10386 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10387 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10388 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10389 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10390 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10391 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10392 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10393 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10394 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10395 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10396 RT_NOREF(pExtState);
10397}
10398
10399
10400IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10401 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10402{
10403 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10404 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10405 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10406 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10407 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10408 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10409 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10410 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10411 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10412 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10413 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10414 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10415 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10416 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10417 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10418 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10419 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
10420 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
10421 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
10422 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
10423 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
10424 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10425 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10426 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10427 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10428 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10429 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10430 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10431 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10432 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10433 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10434 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10435 RT_NOREF(pExtState);
10436}
10437
10438
10439IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10440 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10441{
10442 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10443 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10444 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10445 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10446 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10447 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10448 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10449 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10450 RT_NOREF(pExtState);
10451}
10452
10453
10454IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10455 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10456{
10457 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10458 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10459 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10460 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10461 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10462 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10463 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10464 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10465 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10466 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10467 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
10468 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
10469 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
10470 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
10471 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
10472 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
10473 RT_NOREF(pExtState);
10474}
10475
10476
10477IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10478 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10479{
10480 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10481 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10482 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10483 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10484 RT_NOREF(pExtState);
10485}
10486
10487
10488IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10489 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10490{
10491 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10492 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10493 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10494 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10495 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10496 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10497 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10498 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10499 RT_NOREF(pExtState);
10500}
10501
10502
10503/*
10504 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
10505 */
10506#ifdef IEM_WITHOUT_ASSEMBLY
10507
10508IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10509{
10510 RTUINT64U uSrc1 = { *puDst };
10511 RTUINT64U uSrc2 = { *puSrc };
10512 RTUINT64U uDst;
10513
10514 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
10515 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
10516 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
10517 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
10518 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
10519 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
10520 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
10521 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
10522 *puDst = uDst.u;
10523 RT_NOREF(pFpuState);
10524}
10525
10526
10527IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10528{
10529 RTUINT128U uSrc1 = *puDst;
10530
10531 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
10532 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
10533 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
10534 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
10535 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
10536 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
10537 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
10538 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
10539 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
10540 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
10541 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
10542 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
10543 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
10544 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
10545 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
10546 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
10547 RT_NOREF(pFpuState);
10548}
10549
10550#endif
10551
10552IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10553{
10554 RTUINT128U uSrc1 = *puDst;
10555
10556 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
10557 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
10558 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
10559 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
10560 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
10561 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
10562 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
10563 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
10564 RT_NOREF(pFpuState);
10565}
10566
10567
10568IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10569{
10570 RTUINT128U uSrc1 = *puDst;
10571
10572 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
10573 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
10574 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
10575 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
10576 RT_NOREF(pFpuState);
10577}
10578
10579
10580IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10581 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10582{
10583 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10584 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10585 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10586 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10587 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10588 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10589 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10590 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10591 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10592 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10593 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10594 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10595 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10596 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10597 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10598 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10599 RT_NOREF(pExtState);
10600}
10601
10602
10603IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10604 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10605{
10606 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10607 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10608 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10609 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10610 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10611 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10612 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10613 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10614 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10615 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10616 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10617 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10618 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10619 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10620 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10621 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10622 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
10623 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
10624 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
10625 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
10626 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
10627 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
10628 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
10629 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
10630 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
10631 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
10632 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
10633 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
10634 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
10635 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
10636 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
10637 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
10638 RT_NOREF(pExtState);
10639}
10640
10641
10642IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10643 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10644{
10645 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10646 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10647 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10648 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10649 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10650 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10651 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10652 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10653 RT_NOREF(pExtState);
10654}
10655
10656
10657IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10658 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10659{
10660 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10661 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10662 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10663 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10664 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10665 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10666 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10667 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10668 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10669 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10670 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
10671 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
10672 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
10673 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
10674 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
10675 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
10676 RT_NOREF(pExtState);
10677}
10678
10679
10680IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10681 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10682{
10683 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10684 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10685 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10686 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10687 RT_NOREF(pExtState);
10688}
10689
10690
10691IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10692 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10693{
10694 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10695 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10696 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10697 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10698 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10699 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10700 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10701 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10702 RT_NOREF(pExtState);
10703}
10704
10705
10706/*
10707 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
10708 */
10709#ifdef IEM_WITHOUT_ASSEMBLY
10710
10711IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10712{
10713 RTUINT64U uSrc1 = { *puDst };
10714 RTUINT64U uSrc2 = { *puSrc };
10715 RTUINT64U uDst;
10716
10717 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
10718 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
10719 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
10720 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
10721 *puDst = uDst.u;
10722 RT_NOREF(pFpuState);
10723}
10724
10725
10726IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10727{
10728 RTUINT128U uSrc1 = *puDst;
10729
10730 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10731 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10732 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10733 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10734 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10735 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10736 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10737 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10738 RT_NOREF(pFpuState);
10739}
10740
10741#endif
10742
10743IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10744{
10745 RTUINT128U uSrc1 = *puDst;
10746
10747 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10748 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10749 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10750 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10751 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10752 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10753 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10754 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10755 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10756 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10757 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
10758 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
10759 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
10760 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
10761 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
10762 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
10763 RT_NOREF(pFpuState);
10764}
10765
10766
10767IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10768{
10769 RTUINT128U uSrc1 = *puDst;
10770
10771 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10772 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10773 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10774 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10775 RT_NOREF(pFpuState);
10776}
10777
10778
10779IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10780 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10781{
10782 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10783 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10784 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10785 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10786 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10787 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10788 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10789 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10790 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10791 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10792 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10793 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10794 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10795 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10796 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10797 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10798 RT_NOREF(pExtState);
10799}
10800
10801
10802IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10803 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10804{
10805 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10806 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10807 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10808 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10809 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10810 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10811 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10812 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10813 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10814 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10815 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10816 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10817 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10818 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10819 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10820 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10821 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
10822 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
10823 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
10824 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
10825 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
10826 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
10827 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
10828 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
10829 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
10830 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
10831 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
10832 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
10833 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
10834 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
10835 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
10836 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
10837 RT_NOREF(pExtState);
10838}
10839
10840
10841IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10842 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10843{
10844 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10845 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10846 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10847 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10848 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10849 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10850 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10851 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10852 RT_NOREF(pExtState);
10853}
10854
10855
10856IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10857 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10858{
10859 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10860 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10861 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10862 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10863 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10864 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10865 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10866 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10867 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10868 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10869 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
10870 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
10871 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
10872 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
10873 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
10874 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
10875 RT_NOREF(pExtState);
10876}
10877
10878
10879IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10880 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10881{
10882 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10883 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10884 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10885 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10886 RT_NOREF(pExtState);
10887}
10888
10889
10890IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10891 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10892{
10893 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10894 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10895 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10896 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10897 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10898 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10899 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10900 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10901 RT_NOREF(pExtState);
10902}
10903
10904
10905/*
10906 * PAVGB / VPAVGB / PAVGW / VPAVGW
10907 */
/** Byte average for PAVGB: (a_Src1 + a_Src2 + 1) >> 1, truncated to uint8_t.
 * The first operand is cast to uint16_t before the addition, so the sum of two
 * 8-bit inputs plus one is not cut off before the shift. */
#define PAVGB_EXEC(a_Src1, a_Src2) \
    ((uint8_t)(((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1))

/** Word average for PAVGW: (a_Src1 + a_Src2 + 1) >> 1, truncated to uint16_t.
 * The first operand is cast to uint32_t before the addition, so the sum of two
 * 16-bit inputs plus one is not cut off before the shift. */
#define PAVGW_EXEC(a_Src1, a_Src2) \
    ((uint16_t)(((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1))
10910
10911#ifdef IEM_WITHOUT_ASSEMBLY
10912
10913IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
10914{
10915 RTUINT64U uSrc1 = { *puDst };
10916 RTUINT64U uSrc2 = { *puSrc };
10917 RTUINT64U uDst;
10918
10919 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
10920 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
10921 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
10922 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
10923 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
10924 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
10925 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
10926 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
10927 *puDst = uDst.u;
10928}
10929
10930
10931IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10932{
10933 RTUINT128U uSrc1 = *puDst;
10934
10935 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
10936 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
10937 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
10938 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
10939 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
10940 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
10941 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
10942 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
10943 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
10944 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
10945 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
10946 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
10947 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
10948 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
10949 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
10950 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
10951}
10952
10953
10954IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10955{
10956 RTUINT64U uSrc1 = { *puDst };
10957 RTUINT64U uSrc2 = { *puSrc };
10958 RTUINT64U uDst;
10959
10960 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
10961 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
10962 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
10963 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
10964 *puDst = uDst.u;
10965}
10966
10967
10968IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10969{
10970 RTUINT128U uSrc1 = *puDst;
10971
10972 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
10973 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
10974 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
10975 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
10976 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
10977 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
10978 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
10979 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
10980}
10981
10982#endif
10983
10984IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10985{
10986 RTUINT128U uSrc1 = *puDst;
10987
10988 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
10989 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
10990 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
10991 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
10992 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
10993 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
10994 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
10995 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
10996 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
10997 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
10998 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
10999 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11000 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11001 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11002 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11003 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11004}
11005
11006
11007IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11008{
11009 RTUINT128U uSrc1 = *puDst;
11010
11011 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11012 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11013 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11014 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11015 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11016 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11017 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11018 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11019 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11020 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11021 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11022 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11023 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11024 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11025 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11026 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11027}
11028
11029
11030IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11031{
11032 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11033 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11034 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11035 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11036 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11037 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11038 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11039 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11040 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11041 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11042 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11043 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11044 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11045 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11046 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11047 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11048}
11049
11050
11051IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11052{
11053 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11054 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11055 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11056 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11057 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11058 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11059 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11060 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11061 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11062 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11063 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11064 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11065 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11066 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11067 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11068 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11069 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11070 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11071 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11072 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11073 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11074 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11075 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11076 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11077 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11078 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11079 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11080 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11081 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11082 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11083 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11084 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11085}
11086
11087
11088IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11089{
11090 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11091 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11092 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11093 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11094 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11095 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11096 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11097 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11098}
11099
11100
11101IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11102{
11103 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11104 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11105 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11106 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11107 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11108 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11109 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11110 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11111 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11112 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11113 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
11114 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
11115 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
11116 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
11117 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
11118 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
11119}
11120
11121#undef PAVGB_EXEC
11122#undef PAVGW_EXEC
11123
11124
11125/*
11126 * PMOVMSKB / VPMOVMSKB
11127 */
11128#ifdef IEM_WITHOUT_ASSEMBLY
11129
11130IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
11131{
11132 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11133 uint64_t const uSrc = *pu64Src;
11134 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
11135 | ((uSrc >> (15-1)) & RT_BIT_64(1))
11136 | ((uSrc >> (23-2)) & RT_BIT_64(2))
11137 | ((uSrc >> (31-3)) & RT_BIT_64(3))
11138 | ((uSrc >> (39-4)) & RT_BIT_64(4))
11139 | ((uSrc >> (47-5)) & RT_BIT_64(5))
11140 | ((uSrc >> (55-6)) & RT_BIT_64(6))
11141 | ((uSrc >> (63-7)) & RT_BIT_64(7));
11142}
11143
11144
11145IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
11146{
11147 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11148 uint64_t const uSrc0 = pu128Src->QWords.qw0;
11149 uint64_t const uSrc1 = pu128Src->QWords.qw1;
11150 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11151 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11152 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11153 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11154 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11155 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11156 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11157 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11158 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11159 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11160 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11161 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11162 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11163 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11164 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11165 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
11166}
11167
11168#endif
11169
11170IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
11171{
11172 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11173 uint64_t const uSrc0 = puSrc->QWords.qw0;
11174 uint64_t const uSrc1 = puSrc->QWords.qw1;
11175 uint64_t const uSrc2 = puSrc->QWords.qw2;
11176 uint64_t const uSrc3 = puSrc->QWords.qw3;
11177 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11178 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11179 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11180 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11181 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11182 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11183 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11184 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11185 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11186 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11187 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11188 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11189 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11190 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11191 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11192 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
11193 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
11194 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
11195 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
11196 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
11197 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
11198 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
11199 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
11200 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
11201 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
11202 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
11203 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
11204 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
11205 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
11206 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
11207 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
11208 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
11209}
11210
11211
11212/*
11213 * [V]PSHUFB
11214 */
11215
11216IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11217{
11218 RTUINT64U const uSrc = { *puSrc };
11219 RTUINT64U const uDstIn = { *puDst };
11220 ASMCompilerBarrier();
11221 RTUINT64U uDstOut = { 0 };
11222 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
11223 {
11224 uint8_t idxSrc = uSrc.au8[iByte];
11225 if (!(idxSrc & 0x80))
11226 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
11227 }
11228 *puDst = uDstOut.u;
11229 RT_NOREF(pFpuState);
11230}
11231
11232
11233IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11234{
11235 RTUINT128U const uSrc = *puSrc;
11236 RTUINT128U const uDstIn = *puDst;
11237 ASMCompilerBarrier();
11238 puDst->au64[0] = 0;
11239 puDst->au64[1] = 0;
11240 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11241 {
11242 uint8_t idxSrc = uSrc.au8[iByte];
11243 if (!(idxSrc & 0x80))
11244 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
11245 }
11246 RT_NOREF(pFpuState);
11247}
11248
11249
11250IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11251 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11252{
11253 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
11254 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
11255 ASMCompilerBarrier();
11256 puDst->au64[0] = 0;
11257 puDst->au64[1] = 0;
11258 for (unsigned iByte = 0; iByte < 16; iByte++)
11259 {
11260 uint8_t idxSrc = uSrc2.au8[iByte];
11261 if (!(idxSrc & 0x80))
11262 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11263 }
11264 RT_NOREF(pExtState);
11265}
11266
11267
11268IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11269 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11270{
11271 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
11272 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
11273 ASMCompilerBarrier();
11274 puDst->au64[0] = 0;
11275 puDst->au64[1] = 0;
11276 puDst->au64[2] = 0;
11277 puDst->au64[3] = 0;
11278 for (unsigned iByte = 0; iByte < 16; iByte++)
11279 {
11280 uint8_t idxSrc = uSrc2.au8[iByte];
11281 if (!(idxSrc & 0x80))
11282 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11283 }
11284 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11285 {
11286 uint8_t idxSrc = uSrc2.au8[iByte];
11287 if (!(idxSrc & 0x80))
11288 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
11289 }
11290 RT_NOREF(pExtState);
11291}
11292
11293
11294/*
11295 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
11296 */
11297#ifdef IEM_WITHOUT_ASSEMBLY
11298
11299IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
11300{
11301 uint64_t const uSrc = *puSrc;
11302 ASMCompilerBarrier();
11303 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11304 uSrc >> (((bEvil >> 2) & 3) * 16),
11305 uSrc >> (((bEvil >> 4) & 3) * 16),
11306 uSrc >> (((bEvil >> 6) & 3) * 16));
11307}
11308
11309
11310IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11311{
11312 puDst->QWords.qw0 = puSrc->QWords.qw0;
11313 uint64_t const uSrc = puSrc->QWords.qw1;
11314 ASMCompilerBarrier();
11315 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11316 uSrc >> (((bEvil >> 2) & 3) * 16),
11317 uSrc >> (((bEvil >> 4) & 3) * 16),
11318 uSrc >> (((bEvil >> 6) & 3) * 16));
11319}
11320
11321#endif
11322
11323IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11324{
11325 puDst->QWords.qw0 = puSrc->QWords.qw0;
11326 uint64_t const uSrc1 = puSrc->QWords.qw1;
11327 puDst->QWords.qw2 = puSrc->QWords.qw2;
11328 uint64_t const uSrc3 = puSrc->QWords.qw3;
11329 ASMCompilerBarrier();
11330 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
11331 uSrc1 >> (((bEvil >> 2) & 3) * 16),
11332 uSrc1 >> (((bEvil >> 4) & 3) * 16),
11333 uSrc1 >> (((bEvil >> 6) & 3) * 16));
11334 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
11335 uSrc3 >> (((bEvil >> 2) & 3) * 16),
11336 uSrc3 >> (((bEvil >> 4) & 3) * 16),
11337 uSrc3 >> (((bEvil >> 6) & 3) * 16));
11338}
11339
11340#ifdef IEM_WITHOUT_ASSEMBLY
11341IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11342{
11343 puDst->QWords.qw1 = puSrc->QWords.qw1;
11344 uint64_t const uSrc = puSrc->QWords.qw0;
11345 ASMCompilerBarrier();
11346 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11347 uSrc >> (((bEvil >> 2) & 3) * 16),
11348 uSrc >> (((bEvil >> 4) & 3) * 16),
11349 uSrc >> (((bEvil >> 6) & 3) * 16));
11350
11351}
11352#endif
11353
11354
11355IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11356{
11357 puDst->QWords.qw3 = puSrc->QWords.qw3;
11358 uint64_t const uSrc2 = puSrc->QWords.qw2;
11359 puDst->QWords.qw1 = puSrc->QWords.qw1;
11360 uint64_t const uSrc0 = puSrc->QWords.qw0;
11361 ASMCompilerBarrier();
11362 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
11363 uSrc0 >> (((bEvil >> 2) & 3) * 16),
11364 uSrc0 >> (((bEvil >> 4) & 3) * 16),
11365 uSrc0 >> (((bEvil >> 6) & 3) * 16));
11366 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
11367 uSrc2 >> (((bEvil >> 2) & 3) * 16),
11368 uSrc2 >> (((bEvil >> 4) & 3) * 16),
11369 uSrc2 >> (((bEvil >> 6) & 3) * 16));
11370
11371}
11372
11373
11374#ifdef IEM_WITHOUT_ASSEMBLY
11375IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11376{
11377 RTUINT128U const uSrc = *puSrc;
11378 ASMCompilerBarrier();
11379 puDst->au32[0] = uSrc.au32[bEvil & 3];
11380 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
11381 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
11382 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
11383}
11384#endif
11385
11386
11387IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11388{
11389 RTUINT256U const uSrc = *puSrc;
11390 ASMCompilerBarrier();
11391 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
11392 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
11393 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
11394 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
11395 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
11396 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
11397 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
11398 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
11399}
11400
11401
11402/*
11403 * PUNPCKHBW - high bytes -> words
11404 */
11405#ifdef IEM_WITHOUT_ASSEMBLY
11406
11407IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11408{
11409 RTUINT64U const uSrc2 = { *puSrc };
11410 RTUINT64U const uSrc1 = { *puDst };
11411 ASMCompilerBarrier();
11412 RTUINT64U uDstOut;
11413 uDstOut.au8[0] = uSrc1.au8[4];
11414 uDstOut.au8[1] = uSrc2.au8[4];
11415 uDstOut.au8[2] = uSrc1.au8[5];
11416 uDstOut.au8[3] = uSrc2.au8[5];
11417 uDstOut.au8[4] = uSrc1.au8[6];
11418 uDstOut.au8[5] = uSrc2.au8[6];
11419 uDstOut.au8[6] = uSrc1.au8[7];
11420 uDstOut.au8[7] = uSrc2.au8[7];
11421 *puDst = uDstOut.u;
11422}
11423
11424
11425IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11426{
11427 RTUINT128U const uSrc2 = *puSrc;
11428 RTUINT128U const uSrc1 = *puDst;
11429 ASMCompilerBarrier();
11430 RTUINT128U uDstOut;
11431 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11432 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11433 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11434 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11435 uDstOut.au8[ 4] = uSrc1.au8[10];
11436 uDstOut.au8[ 5] = uSrc2.au8[10];
11437 uDstOut.au8[ 6] = uSrc1.au8[11];
11438 uDstOut.au8[ 7] = uSrc2.au8[11];
11439 uDstOut.au8[ 8] = uSrc1.au8[12];
11440 uDstOut.au8[ 9] = uSrc2.au8[12];
11441 uDstOut.au8[10] = uSrc1.au8[13];
11442 uDstOut.au8[11] = uSrc2.au8[13];
11443 uDstOut.au8[12] = uSrc1.au8[14];
11444 uDstOut.au8[13] = uSrc2.au8[14];
11445 uDstOut.au8[14] = uSrc1.au8[15];
11446 uDstOut.au8[15] = uSrc2.au8[15];
11447 *puDst = uDstOut;
11448}
11449
11450#endif
11451
11452IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11453{
11454 RTUINT128U const uSrc2 = *puSrc2;
11455 RTUINT128U const uSrc1 = *puSrc1;
11456 ASMCompilerBarrier();
11457 RTUINT128U uDstOut;
11458 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11459 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11460 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11461 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11462 uDstOut.au8[ 4] = uSrc1.au8[10];
11463 uDstOut.au8[ 5] = uSrc2.au8[10];
11464 uDstOut.au8[ 6] = uSrc1.au8[11];
11465 uDstOut.au8[ 7] = uSrc2.au8[11];
11466 uDstOut.au8[ 8] = uSrc1.au8[12];
11467 uDstOut.au8[ 9] = uSrc2.au8[12];
11468 uDstOut.au8[10] = uSrc1.au8[13];
11469 uDstOut.au8[11] = uSrc2.au8[13];
11470 uDstOut.au8[12] = uSrc1.au8[14];
11471 uDstOut.au8[13] = uSrc2.au8[14];
11472 uDstOut.au8[14] = uSrc1.au8[15];
11473 uDstOut.au8[15] = uSrc2.au8[15];
11474 *puDst = uDstOut;
11475}
11476
11477
11478IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11479{
11480 RTUINT256U const uSrc2 = *puSrc2;
11481 RTUINT256U const uSrc1 = *puSrc1;
11482 ASMCompilerBarrier();
11483 RTUINT256U uDstOut;
11484 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11485 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11486 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11487 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11488 uDstOut.au8[ 4] = uSrc1.au8[10];
11489 uDstOut.au8[ 5] = uSrc2.au8[10];
11490 uDstOut.au8[ 6] = uSrc1.au8[11];
11491 uDstOut.au8[ 7] = uSrc2.au8[11];
11492 uDstOut.au8[ 8] = uSrc1.au8[12];
11493 uDstOut.au8[ 9] = uSrc2.au8[12];
11494 uDstOut.au8[10] = uSrc1.au8[13];
11495 uDstOut.au8[11] = uSrc2.au8[13];
11496 uDstOut.au8[12] = uSrc1.au8[14];
11497 uDstOut.au8[13] = uSrc2.au8[14];
11498 uDstOut.au8[14] = uSrc1.au8[15];
11499 uDstOut.au8[15] = uSrc2.au8[15];
11500 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11501 uDstOut.au8[16] = uSrc1.au8[24];
11502 uDstOut.au8[17] = uSrc2.au8[24];
11503 uDstOut.au8[18] = uSrc1.au8[25];
11504 uDstOut.au8[19] = uSrc2.au8[25];
11505 uDstOut.au8[20] = uSrc1.au8[26];
11506 uDstOut.au8[21] = uSrc2.au8[26];
11507 uDstOut.au8[22] = uSrc1.au8[27];
11508 uDstOut.au8[23] = uSrc2.au8[27];
11509 uDstOut.au8[24] = uSrc1.au8[28];
11510 uDstOut.au8[25] = uSrc2.au8[28];
11511 uDstOut.au8[26] = uSrc1.au8[29];
11512 uDstOut.au8[27] = uSrc2.au8[29];
11513 uDstOut.au8[28] = uSrc1.au8[30];
11514 uDstOut.au8[29] = uSrc2.au8[30];
11515 uDstOut.au8[30] = uSrc1.au8[31];
11516 uDstOut.au8[31] = uSrc2.au8[31];
11517 *puDst = uDstOut;
11518}
11519
11520
/*
 * PUNPCKHWD - high words -> dwords
 */
11524#ifdef IEM_WITHOUT_ASSEMBLY
11525
11526IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11527{
11528 RTUINT64U const uSrc2 = { *puSrc };
11529 RTUINT64U const uSrc1 = { *puDst };
11530 ASMCompilerBarrier();
11531 RTUINT64U uDstOut;
11532 uDstOut.au16[0] = uSrc1.au16[2];
11533 uDstOut.au16[1] = uSrc2.au16[2];
11534 uDstOut.au16[2] = uSrc1.au16[3];
11535 uDstOut.au16[3] = uSrc2.au16[3];
11536 *puDst = uDstOut.u;
11537}
11538
11539
11540IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11541{
11542 RTUINT128U const uSrc2 = *puSrc;
11543 RTUINT128U const uSrc1 = *puDst;
11544 ASMCompilerBarrier();
11545 RTUINT128U uDstOut;
11546 uDstOut.au16[0] = uSrc1.au16[4];
11547 uDstOut.au16[1] = uSrc2.au16[4];
11548 uDstOut.au16[2] = uSrc1.au16[5];
11549 uDstOut.au16[3] = uSrc2.au16[5];
11550 uDstOut.au16[4] = uSrc1.au16[6];
11551 uDstOut.au16[5] = uSrc2.au16[6];
11552 uDstOut.au16[6] = uSrc1.au16[7];
11553 uDstOut.au16[7] = uSrc2.au16[7];
11554 *puDst = uDstOut;
11555}
11556
11557#endif
11558
11559IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11560{
11561 RTUINT128U const uSrc2 = *puSrc2;
11562 RTUINT128U const uSrc1 = *puSrc1;
11563 ASMCompilerBarrier();
11564 RTUINT128U uDstOut;
11565 uDstOut.au16[0] = uSrc1.au16[4];
11566 uDstOut.au16[1] = uSrc2.au16[4];
11567 uDstOut.au16[2] = uSrc1.au16[5];
11568 uDstOut.au16[3] = uSrc2.au16[5];
11569 uDstOut.au16[4] = uSrc1.au16[6];
11570 uDstOut.au16[5] = uSrc2.au16[6];
11571 uDstOut.au16[6] = uSrc1.au16[7];
11572 uDstOut.au16[7] = uSrc2.au16[7];
11573 *puDst = uDstOut;
11574}
11575
11576
11577IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11578{
11579 RTUINT256U const uSrc2 = *puSrc2;
11580 RTUINT256U const uSrc1 = *puSrc1;
11581 ASMCompilerBarrier();
11582 RTUINT256U uDstOut;
11583 uDstOut.au16[0] = uSrc1.au16[4];
11584 uDstOut.au16[1] = uSrc2.au16[4];
11585 uDstOut.au16[2] = uSrc1.au16[5];
11586 uDstOut.au16[3] = uSrc2.au16[5];
11587 uDstOut.au16[4] = uSrc1.au16[6];
11588 uDstOut.au16[5] = uSrc2.au16[6];
11589 uDstOut.au16[6] = uSrc1.au16[7];
11590 uDstOut.au16[7] = uSrc2.au16[7];
11591
11592 uDstOut.au16[8] = uSrc1.au16[12];
11593 uDstOut.au16[9] = uSrc2.au16[12];
11594 uDstOut.au16[10] = uSrc1.au16[13];
11595 uDstOut.au16[11] = uSrc2.au16[13];
11596 uDstOut.au16[12] = uSrc1.au16[14];
11597 uDstOut.au16[13] = uSrc2.au16[14];
11598 uDstOut.au16[14] = uSrc1.au16[15];
11599 uDstOut.au16[15] = uSrc2.au16[15];
11600 *puDst = uDstOut;
11601}
11602
11603
/*
 * PUNPCKHDQ - high dwords -> qword(s)
 */
11607#ifdef IEM_WITHOUT_ASSEMBLY
11608
11609IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11610{
11611 RTUINT64U const uSrc2 = { *puSrc };
11612 RTUINT64U const uSrc1 = { *puDst };
11613 ASMCompilerBarrier();
11614 RTUINT64U uDstOut;
11615 uDstOut.au32[0] = uSrc1.au32[1];
11616 uDstOut.au32[1] = uSrc2.au32[1];
11617 *puDst = uDstOut.u;
11618}
11619
11620
11621IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11622{
11623 RTUINT128U const uSrc2 = *puSrc;
11624 RTUINT128U const uSrc1 = *puDst;
11625 ASMCompilerBarrier();
11626 RTUINT128U uDstOut;
11627 uDstOut.au32[0] = uSrc1.au32[2];
11628 uDstOut.au32[1] = uSrc2.au32[2];
11629 uDstOut.au32[2] = uSrc1.au32[3];
11630 uDstOut.au32[3] = uSrc2.au32[3];
11631 *puDst = uDstOut;
11632}
11633
11634#endif
11635
11636IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11637{
11638 RTUINT128U const uSrc2 = *puSrc2;
11639 RTUINT128U const uSrc1 = *puSrc1;
11640 ASMCompilerBarrier();
11641 RTUINT128U uDstOut;
11642 uDstOut.au32[0] = uSrc1.au32[2];
11643 uDstOut.au32[1] = uSrc2.au32[2];
11644 uDstOut.au32[2] = uSrc1.au32[3];
11645 uDstOut.au32[3] = uSrc2.au32[3];
11646 *puDst = uDstOut;
11647}
11648
11649
11650IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11651{
11652 RTUINT256U const uSrc2 = *puSrc2;
11653 RTUINT256U const uSrc1 = *puSrc1;
11654 ASMCompilerBarrier();
11655 RTUINT256U uDstOut;
11656 uDstOut.au32[0] = uSrc1.au32[2];
11657 uDstOut.au32[1] = uSrc2.au32[2];
11658 uDstOut.au32[2] = uSrc1.au32[3];
11659 uDstOut.au32[3] = uSrc2.au32[3];
11660
11661 uDstOut.au32[4] = uSrc1.au32[6];
11662 uDstOut.au32[5] = uSrc2.au32[6];
11663 uDstOut.au32[6] = uSrc1.au32[7];
11664 uDstOut.au32[7] = uSrc2.au32[7];
11665 *puDst = uDstOut;
11666}
11667
11668
11669/*
11670 * PUNPCKHQDQ -> High qwords -> double qword(s).
11671 */
11672#ifdef IEM_WITHOUT_ASSEMBLY
11673IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11674{
11675 RTUINT128U const uSrc2 = *puSrc;
11676 RTUINT128U const uSrc1 = *puDst;
11677 ASMCompilerBarrier();
11678 RTUINT128U uDstOut;
11679 uDstOut.au64[0] = uSrc1.au64[1];
11680 uDstOut.au64[1] = uSrc2.au64[1];
11681 *puDst = uDstOut;
11682}
11683#endif
11684
11685
11686IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11687{
11688 RTUINT128U const uSrc2 = *puSrc2;
11689 RTUINT128U const uSrc1 = *puSrc1;
11690 ASMCompilerBarrier();
11691 RTUINT128U uDstOut;
11692 uDstOut.au64[0] = uSrc1.au64[1];
11693 uDstOut.au64[1] = uSrc2.au64[1];
11694 *puDst = uDstOut;
11695}
11696
11697
11698IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11699{
11700 RTUINT256U const uSrc2 = *puSrc2;
11701 RTUINT256U const uSrc1 = *puSrc1;
11702 ASMCompilerBarrier();
11703 RTUINT256U uDstOut;
11704 uDstOut.au64[0] = uSrc1.au64[1];
11705 uDstOut.au64[1] = uSrc2.au64[1];
11706
11707 uDstOut.au64[2] = uSrc1.au64[3];
11708 uDstOut.au64[3] = uSrc2.au64[3];
11709 *puDst = uDstOut;
11710}
11711
11712
11713/*
11714 * PUNPCKLBW - low bytes -> words
11715 */
11716#ifdef IEM_WITHOUT_ASSEMBLY
11717
11718IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11719{
11720 RTUINT64U const uSrc2 = { *puSrc };
11721 RTUINT64U const uSrc1 = { *puDst };
11722 ASMCompilerBarrier();
11723 RTUINT64U uDstOut;
11724 uDstOut.au8[0] = uSrc1.au8[0];
11725 uDstOut.au8[1] = uSrc2.au8[0];
11726 uDstOut.au8[2] = uSrc1.au8[1];
11727 uDstOut.au8[3] = uSrc2.au8[1];
11728 uDstOut.au8[4] = uSrc1.au8[2];
11729 uDstOut.au8[5] = uSrc2.au8[2];
11730 uDstOut.au8[6] = uSrc1.au8[3];
11731 uDstOut.au8[7] = uSrc2.au8[3];
11732 *puDst = uDstOut.u;
11733}
11734
11735
11736IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11737{
11738 RTUINT128U const uSrc2 = *puSrc;
11739 RTUINT128U const uSrc1 = *puDst;
11740 ASMCompilerBarrier();
11741 RTUINT128U uDstOut;
11742 uDstOut.au8[ 0] = uSrc1.au8[0];
11743 uDstOut.au8[ 1] = uSrc2.au8[0];
11744 uDstOut.au8[ 2] = uSrc1.au8[1];
11745 uDstOut.au8[ 3] = uSrc2.au8[1];
11746 uDstOut.au8[ 4] = uSrc1.au8[2];
11747 uDstOut.au8[ 5] = uSrc2.au8[2];
11748 uDstOut.au8[ 6] = uSrc1.au8[3];
11749 uDstOut.au8[ 7] = uSrc2.au8[3];
11750 uDstOut.au8[ 8] = uSrc1.au8[4];
11751 uDstOut.au8[ 9] = uSrc2.au8[4];
11752 uDstOut.au8[10] = uSrc1.au8[5];
11753 uDstOut.au8[11] = uSrc2.au8[5];
11754 uDstOut.au8[12] = uSrc1.au8[6];
11755 uDstOut.au8[13] = uSrc2.au8[6];
11756 uDstOut.au8[14] = uSrc1.au8[7];
11757 uDstOut.au8[15] = uSrc2.au8[7];
11758 *puDst = uDstOut;
11759}
11760
11761#endif
11762
11763IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11764{
11765 RTUINT128U const uSrc2 = *puSrc2;
11766 RTUINT128U const uSrc1 = *puSrc1;
11767 ASMCompilerBarrier();
11768 RTUINT128U uDstOut;
11769 uDstOut.au8[ 0] = uSrc1.au8[0];
11770 uDstOut.au8[ 1] = uSrc2.au8[0];
11771 uDstOut.au8[ 2] = uSrc1.au8[1];
11772 uDstOut.au8[ 3] = uSrc2.au8[1];
11773 uDstOut.au8[ 4] = uSrc1.au8[2];
11774 uDstOut.au8[ 5] = uSrc2.au8[2];
11775 uDstOut.au8[ 6] = uSrc1.au8[3];
11776 uDstOut.au8[ 7] = uSrc2.au8[3];
11777 uDstOut.au8[ 8] = uSrc1.au8[4];
11778 uDstOut.au8[ 9] = uSrc2.au8[4];
11779 uDstOut.au8[10] = uSrc1.au8[5];
11780 uDstOut.au8[11] = uSrc2.au8[5];
11781 uDstOut.au8[12] = uSrc1.au8[6];
11782 uDstOut.au8[13] = uSrc2.au8[6];
11783 uDstOut.au8[14] = uSrc1.au8[7];
11784 uDstOut.au8[15] = uSrc2.au8[7];
11785 *puDst = uDstOut;
11786}
11787
11788
11789IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11790{
11791 RTUINT256U const uSrc2 = *puSrc2;
11792 RTUINT256U const uSrc1 = *puSrc1;
11793 ASMCompilerBarrier();
11794 RTUINT256U uDstOut;
11795 uDstOut.au8[ 0] = uSrc1.au8[0];
11796 uDstOut.au8[ 1] = uSrc2.au8[0];
11797 uDstOut.au8[ 2] = uSrc1.au8[1];
11798 uDstOut.au8[ 3] = uSrc2.au8[1];
11799 uDstOut.au8[ 4] = uSrc1.au8[2];
11800 uDstOut.au8[ 5] = uSrc2.au8[2];
11801 uDstOut.au8[ 6] = uSrc1.au8[3];
11802 uDstOut.au8[ 7] = uSrc2.au8[3];
11803 uDstOut.au8[ 8] = uSrc1.au8[4];
11804 uDstOut.au8[ 9] = uSrc2.au8[4];
11805 uDstOut.au8[10] = uSrc1.au8[5];
11806 uDstOut.au8[11] = uSrc2.au8[5];
11807 uDstOut.au8[12] = uSrc1.au8[6];
11808 uDstOut.au8[13] = uSrc2.au8[6];
11809 uDstOut.au8[14] = uSrc1.au8[7];
11810 uDstOut.au8[15] = uSrc2.au8[7];
11811 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11812 uDstOut.au8[16] = uSrc1.au8[16];
11813 uDstOut.au8[17] = uSrc2.au8[16];
11814 uDstOut.au8[18] = uSrc1.au8[17];
11815 uDstOut.au8[19] = uSrc2.au8[17];
11816 uDstOut.au8[20] = uSrc1.au8[18];
11817 uDstOut.au8[21] = uSrc2.au8[18];
11818 uDstOut.au8[22] = uSrc1.au8[19];
11819 uDstOut.au8[23] = uSrc2.au8[19];
11820 uDstOut.au8[24] = uSrc1.au8[20];
11821 uDstOut.au8[25] = uSrc2.au8[20];
11822 uDstOut.au8[26] = uSrc1.au8[21];
11823 uDstOut.au8[27] = uSrc2.au8[21];
11824 uDstOut.au8[28] = uSrc1.au8[22];
11825 uDstOut.au8[29] = uSrc2.au8[22];
11826 uDstOut.au8[30] = uSrc1.au8[23];
11827 uDstOut.au8[31] = uSrc2.au8[23];
11828 *puDst = uDstOut;
11829}
11830
11831
11832/*
 * PUNPCKLWD - low words -> dwords
11834 */
11835#ifdef IEM_WITHOUT_ASSEMBLY
11836
11837IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11838{
11839 RTUINT64U const uSrc2 = { *puSrc };
11840 RTUINT64U const uSrc1 = { *puDst };
11841 ASMCompilerBarrier();
11842 RTUINT64U uDstOut;
11843 uDstOut.au16[0] = uSrc1.au16[0];
11844 uDstOut.au16[1] = uSrc2.au16[0];
11845 uDstOut.au16[2] = uSrc1.au16[1];
11846 uDstOut.au16[3] = uSrc2.au16[1];
11847 *puDst = uDstOut.u;
11848}
11849
11850
11851IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11852{
11853 RTUINT128U const uSrc2 = *puSrc;
11854 RTUINT128U const uSrc1 = *puDst;
11855 ASMCompilerBarrier();
11856 RTUINT128U uDstOut;
11857 uDstOut.au16[0] = uSrc1.au16[0];
11858 uDstOut.au16[1] = uSrc2.au16[0];
11859 uDstOut.au16[2] = uSrc1.au16[1];
11860 uDstOut.au16[3] = uSrc2.au16[1];
11861 uDstOut.au16[4] = uSrc1.au16[2];
11862 uDstOut.au16[5] = uSrc2.au16[2];
11863 uDstOut.au16[6] = uSrc1.au16[3];
11864 uDstOut.au16[7] = uSrc2.au16[3];
11865 *puDst = uDstOut;
11866}
11867
11868#endif
11869
11870IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11871{
11872 RTUINT128U const uSrc2 = *puSrc2;
11873 RTUINT128U const uSrc1 = *puSrc1;
11874 ASMCompilerBarrier();
11875 RTUINT128U uDstOut;
11876 uDstOut.au16[0] = uSrc1.au16[0];
11877 uDstOut.au16[1] = uSrc2.au16[0];
11878 uDstOut.au16[2] = uSrc1.au16[1];
11879 uDstOut.au16[3] = uSrc2.au16[1];
11880 uDstOut.au16[4] = uSrc1.au16[2];
11881 uDstOut.au16[5] = uSrc2.au16[2];
11882 uDstOut.au16[6] = uSrc1.au16[3];
11883 uDstOut.au16[7] = uSrc2.au16[3];
11884 *puDst = uDstOut;
11885}
11886
11887
11888IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11889{
11890 RTUINT256U const uSrc2 = *puSrc2;
11891 RTUINT256U const uSrc1 = *puSrc1;
11892 ASMCompilerBarrier();
11893 RTUINT256U uDstOut;
11894 uDstOut.au16[0] = uSrc1.au16[0];
11895 uDstOut.au16[1] = uSrc2.au16[0];
11896 uDstOut.au16[2] = uSrc1.au16[1];
11897 uDstOut.au16[3] = uSrc2.au16[1];
11898 uDstOut.au16[4] = uSrc1.au16[2];
11899 uDstOut.au16[5] = uSrc2.au16[2];
11900 uDstOut.au16[6] = uSrc1.au16[3];
11901 uDstOut.au16[7] = uSrc2.au16[3];
11902
11903 uDstOut.au16[8] = uSrc1.au16[8];
11904 uDstOut.au16[9] = uSrc2.au16[8];
11905 uDstOut.au16[10] = uSrc1.au16[9];
11906 uDstOut.au16[11] = uSrc2.au16[9];
11907 uDstOut.au16[12] = uSrc1.au16[10];
11908 uDstOut.au16[13] = uSrc2.au16[10];
11909 uDstOut.au16[14] = uSrc1.au16[11];
11910 uDstOut.au16[15] = uSrc2.au16[11];
11911 *puDst = uDstOut;
11912}
11913
11914
11915/*
 * PUNPCKLDQ - low dwords -> qword(s)
11917 */
11918#ifdef IEM_WITHOUT_ASSEMBLY
11919
11920IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11921{
11922 RTUINT64U const uSrc2 = { *puSrc };
11923 RTUINT64U const uSrc1 = { *puDst };
11924 ASMCompilerBarrier();
11925 RTUINT64U uDstOut;
11926 uDstOut.au32[0] = uSrc1.au32[0];
11927 uDstOut.au32[1] = uSrc2.au32[0];
11928 *puDst = uDstOut.u;
11929}
11930
11931
11932IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11933{
11934 RTUINT128U const uSrc2 = *puSrc;
11935 RTUINT128U const uSrc1 = *puDst;
11936 ASMCompilerBarrier();
11937 RTUINT128U uDstOut;
11938 uDstOut.au32[0] = uSrc1.au32[0];
11939 uDstOut.au32[1] = uSrc2.au32[0];
11940 uDstOut.au32[2] = uSrc1.au32[1];
11941 uDstOut.au32[3] = uSrc2.au32[1];
11942 *puDst = uDstOut;
11943}
11944
11945#endif
11946
11947IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11948{
11949 RTUINT128U const uSrc2 = *puSrc2;
11950 RTUINT128U const uSrc1 = *puSrc1;
11951 ASMCompilerBarrier();
11952 RTUINT128U uDstOut;
11953 uDstOut.au32[0] = uSrc1.au32[0];
11954 uDstOut.au32[1] = uSrc2.au32[0];
11955 uDstOut.au32[2] = uSrc1.au32[1];
11956 uDstOut.au32[3] = uSrc2.au32[1];
11957 *puDst = uDstOut;
11958}
11959
11960
11961IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11962{
11963 RTUINT256U const uSrc2 = *puSrc2;
11964 RTUINT256U const uSrc1 = *puSrc1;
11965 ASMCompilerBarrier();
11966 RTUINT256U uDstOut;
11967 uDstOut.au32[0] = uSrc1.au32[0];
11968 uDstOut.au32[1] = uSrc2.au32[0];
11969 uDstOut.au32[2] = uSrc1.au32[1];
11970 uDstOut.au32[3] = uSrc2.au32[1];
11971
11972 uDstOut.au32[4] = uSrc1.au32[4];
11973 uDstOut.au32[5] = uSrc2.au32[4];
11974 uDstOut.au32[6] = uSrc1.au32[5];
11975 uDstOut.au32[7] = uSrc2.au32[5];
11976 *puDst = uDstOut;
11977}
11978
11979
11980/*
11981 * PUNPCKLQDQ -> Low qwords -> double qword(s).
11982 */
11983#ifdef IEM_WITHOUT_ASSEMBLY
11984IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11985{
11986 RTUINT128U const uSrc2 = *puSrc;
11987 RTUINT128U const uSrc1 = *puDst;
11988 ASMCompilerBarrier();
11989 RTUINT128U uDstOut;
11990 uDstOut.au64[0] = uSrc1.au64[0];
11991 uDstOut.au64[1] = uSrc2.au64[0];
11992 *puDst = uDstOut;
11993}
11994#endif
11995
11996
11997IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11998{
11999 RTUINT128U const uSrc2 = *puSrc2;
12000 RTUINT128U const uSrc1 = *puSrc1;
12001 ASMCompilerBarrier();
12002 RTUINT128U uDstOut;
12003 uDstOut.au64[0] = uSrc1.au64[0];
12004 uDstOut.au64[1] = uSrc2.au64[0];
12005 *puDst = uDstOut;
12006}
12007
12008
12009IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12010{
12011 RTUINT256U const uSrc2 = *puSrc2;
12012 RTUINT256U const uSrc1 = *puSrc1;
12013 ASMCompilerBarrier();
12014 RTUINT256U uDstOut;
12015 uDstOut.au64[0] = uSrc1.au64[0];
12016 uDstOut.au64[1] = uSrc2.au64[0];
12017
12018 uDstOut.au64[2] = uSrc1.au64[2];
12019 uDstOut.au64[3] = uSrc2.au64[2];
12020 *puDst = uDstOut;
12021}
12022
12023
12024/*
12025 * PACKSSWB - signed words -> signed bytes
12026 */
12027
12028#ifdef IEM_WITHOUT_ASSEMBLY
12029
12030IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12031{
12032 RTUINT64U const uSrc2 = { *puSrc };
12033 RTUINT64U const uSrc1 = { *puDst };
12034 ASMCompilerBarrier();
12035 RTUINT64U uDstOut;
12036 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12037 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12038 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12039 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12040 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12041 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12042 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12043 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12044 *puDst = uDstOut.u;
12045}
12046
12047
12048IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12049{
12050 RTUINT128U const uSrc2 = *puSrc;
12051 RTUINT128U const uSrc1 = *puDst;
12052 ASMCompilerBarrier();
12053 RTUINT128U uDstOut;
12054 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12055 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12056 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12057 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12058 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12059 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12060 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12061 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12062 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12063 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12064 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12065 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12066 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12067 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12068 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12069 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12070 *puDst = uDstOut;
12071}
12072
12073#endif
12074
12075IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12076{
12077 RTUINT128U const uSrc2 = *puSrc2;
12078 RTUINT128U const uSrc1 = *puSrc1;
12079 ASMCompilerBarrier();
12080 RTUINT128U uDstOut;
12081 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12082 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12083 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12084 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12085 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12086 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12087 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12088 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12089 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12090 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12091 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12092 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12093 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12094 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12095 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12096 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12097 *puDst = uDstOut;
12098}
12099
12100
12101IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12102{
12103 RTUINT256U const uSrc2 = *puSrc2;
12104 RTUINT256U const uSrc1 = *puSrc1;
12105 ASMCompilerBarrier();
12106 RTUINT256U uDstOut;
12107 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12108 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12109 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12110 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12111 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12112 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12113 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12114 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12115 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12116 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12117 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12118 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12119 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12120 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12121 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12122 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12123
12124 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
12125 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
12126 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
12127 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
12128 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
12129 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
12130 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
12131 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
12132 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
12133 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
12134 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
12135 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
12136 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
12137 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
12138 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
12139 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
12140 *puDst = uDstOut;
12141}
12142
12143
12144/*
12145 * PACKUSWB - signed words -> unsigned bytes
12146 */
/**
 * Converts a 16-bit word to a byte with unsigned saturation.
 *
 * A value whose 16-bit pattern is <= 0xff is passed through unchanged.
 * Anything else yields 0x00 when bit 15 of the argument is set and 0xff when
 * it is clear.
 *
 * @note    The argument is expanded more than once, so it should be free of
 *          side effects.
 */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( (uint16_t)(a_iWord) <= (uint16_t)0xff \
      ? (uint8_t)(a_iWord) \
      : (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) ) /* 0xff = UINT8_MAX; 0x00 == UINT8_MIN; source bit 15 = sign */
12151
12152#ifdef IEM_WITHOUT_ASSEMBLY
12153
12154IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12155{
12156 RTUINT64U const uSrc2 = { *puSrc };
12157 RTUINT64U const uSrc1 = { *puDst };
12158 ASMCompilerBarrier();
12159 RTUINT64U uDstOut;
12160 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12161 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12162 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12163 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12164 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12165 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12166 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12167 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12168 *puDst = uDstOut.u;
12169}
12170
12171
12172IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12173{
12174 RTUINT128U const uSrc2 = *puSrc;
12175 RTUINT128U const uSrc1 = *puDst;
12176 ASMCompilerBarrier();
12177 RTUINT128U uDstOut;
12178 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12179 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12180 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12181 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12182 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12183 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12184 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12185 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12186 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12187 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12188 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12189 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12190 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12191 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12192 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12193 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12194 *puDst = uDstOut;
12195}
12196
12197#endif
12198
12199IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12200{
12201 RTUINT128U const uSrc2 = *puSrc2;
12202 RTUINT128U const uSrc1 = *puSrc1;
12203 ASMCompilerBarrier();
12204 RTUINT128U uDstOut;
12205 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12206 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12207 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12208 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12209 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12210 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12211 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12212 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12213 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12214 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12215 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12216 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12217 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12218 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12219 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12220 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12221 *puDst = uDstOut;
12222}
12223
12224
12225IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12226{
12227 RTUINT256U const uSrc2 = *puSrc2;
12228 RTUINT256U const uSrc1 = *puSrc1;
12229 ASMCompilerBarrier();
12230 RTUINT256U uDstOut;
12231 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12232 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12233 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12234 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12235 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12236 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12237 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12238 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12239 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12240 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12241 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12242 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12243 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12244 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12245 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12246 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12247
12248 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
12249 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
12250 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
12251 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
12252 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
12253 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
12254 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
12255 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
12256 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
12257 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
12258 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
12259 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
12260 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
12261 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
12262 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
12263 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
12264 *puDst = uDstOut;
12265}
12266
12267
12268/*
12269 * PACKSSDW - signed dwords -> signed words
12270 */
12271
12272#ifdef IEM_WITHOUT_ASSEMBLY
12273
12274IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12275{
12276 RTUINT64U const uSrc2 = { *puSrc };
12277 RTUINT64U const uSrc1 = { *puDst };
12278 ASMCompilerBarrier();
12279 RTUINT64U uDstOut;
12280 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12281 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12282 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12283 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12284 *puDst = uDstOut.u;
12285}
12286
12287
12288IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12289{
12290 RTUINT128U const uSrc2 = *puSrc;
12291 RTUINT128U const uSrc1 = *puDst;
12292 ASMCompilerBarrier();
12293 RTUINT128U uDstOut;
12294 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12295 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12296 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12297 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12298 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12299 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12300 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12301 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12302 *puDst = uDstOut;
12303}
12304
12305#endif
12306
12307IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12308{
12309 RTUINT128U const uSrc2 = *puSrc2;
12310 RTUINT128U const uSrc1 = *puSrc1;
12311 ASMCompilerBarrier();
12312 RTUINT128U uDstOut;
12313 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12314 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12315 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12316 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12317 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12318 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12319 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12320 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12321 *puDst = uDstOut;
12322}
12323
12324
12325IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12326{
12327 RTUINT256U const uSrc2 = *puSrc2;
12328 RTUINT256U const uSrc1 = *puSrc1;
12329 ASMCompilerBarrier();
12330 RTUINT256U uDstOut;
12331 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12332 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12333 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12334 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12335 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12336 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12337 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12338 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12339
12340 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
12341 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
12342 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
12343 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
12344 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
12345 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
12346 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
12347 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
12348 *puDst = uDstOut;
12349}
12350
12351
12352/*
12353 * PACKUSDW - signed dwords -> unsigned words
12354 */
/**
 * Converts a 32-bit dword to a word with unsigned saturation.
 *
 * A value whose 32-bit pattern is <= 0xffff is passed through unchanged.
 * Anything else yields 0x0000 when bit 31 of the argument is set and 0xffff
 * when it is clear.
 *
 * @note    The argument is expanded more than once, so it should be free of
 *          side effects.
 */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    ( (uint32_t)(a_iDword) <= (uint16_t)0xffff \
      ? (uint16_t)(a_iDword) \
      : (uint16_t)0xffff * (uint16_t)((((a_iDword) >> 31) & 1) ^ 1) ) /* 0xffff = UINT16_MAX; source bit 31 = sign */
12359
12360#ifdef IEM_WITHOUT_ASSEMBLY
12361IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12362{
12363 RTUINT128U const uSrc2 = *puSrc;
12364 RTUINT128U const uSrc1 = *puDst;
12365 ASMCompilerBarrier();
12366 RTUINT128U uDstOut;
12367 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12368 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12369 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12370 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12371 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12372 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12373 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12374 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12375 *puDst = uDstOut;
12376}
12377#endif
12378
12379IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12380{
12381 RTUINT128U const uSrc2 = *puSrc2;
12382 RTUINT128U const uSrc1 = *puSrc1;
12383 ASMCompilerBarrier();
12384 RTUINT128U uDstOut;
12385 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12386 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12387 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12388 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12389 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12390 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12391 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12392 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12393 *puDst = uDstOut;
12394}
12395
12396
12397IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12398{
12399 RTUINT256U const uSrc2 = *puSrc2;
12400 RTUINT256U const uSrc1 = *puSrc1;
12401 ASMCompilerBarrier();
12402 RTUINT256U uDstOut;
12403 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12404 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12405 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12406 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12407 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12408 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12409 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12410 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12411
12412 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
12413 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
12414 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
12415 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
12416 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
12417 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
12418 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
12419 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
12420 *puDst = uDstOut;
12421}
12422
12423
12424/*
12425 * [V]PABSB / [V]PABSW / [V]PABSD
12426 */
12427
12428IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12429{
12430 RTUINT64U const uSrc = { *puSrc };
12431 RTUINT64U uDstOut = { 0 };
12432
12433 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12434 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12435 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12436 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12437 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12438 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12439 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12440 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12441 *puDst = uDstOut.u;
12442 RT_NOREF(pFpuState);
12443}
12444
12445
12446IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12447{
12448 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12449 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12450 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12451 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12452 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12453 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12454 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12455 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12456 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12457 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12458 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12459 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12460 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12461 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12462 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12463 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12464 RT_NOREF(pFpuState);
12465}
12466
12467
12468IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12469{
12470 RTUINT64U const uSrc = { *puSrc };
12471 RTUINT64U uDstOut = { 0 };
12472
12473 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
12474 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
12475 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
12476 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
12477 *puDst = uDstOut.u;
12478 RT_NOREF(pFpuState);
12479}
12480
12481
12482IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12483{
12484 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12485 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12486 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12487 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12488 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12489 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12490 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12491 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12492 RT_NOREF(pFpuState);
12493}
12494
12495
12496IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12497{
12498 RTUINT64U const uSrc = { *puSrc };
12499 RTUINT64U uDstOut = { 0 };
12500
12501 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
12502 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
12503 *puDst = uDstOut.u;
12504 RT_NOREF(pFpuState);
12505}
12506
12507
12508IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12509{
12510 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12511 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12512 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12513 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12514 RT_NOREF(pFpuState);
12515}
12516
12517
12518IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12519{
12520 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12521 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12522 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12523 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12524 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12525 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12526 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12527 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12528 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12529 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12530 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12531 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12532 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12533 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12534 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12535 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12536}
12537
12538
12539IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12540{
12541 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12542 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12543 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12544 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12545 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12546 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12547 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12548 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12549 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12550 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12551 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12552 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12553 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12554 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12555 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12556 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12557 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
12558 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
12559 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
12560 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
12561 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
12562 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
12563 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
12564 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
12565 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
12566 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
12567 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
12568 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
12569 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
12570 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
12571 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
12572 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
12573}
12574
12575
12576IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12577{
12578 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12579 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12580 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12581 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12582 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12583 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12584 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12585 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12586}
12587
12588
12589IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12590{
12591 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12592 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12593 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12594 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12595 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12596 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12597 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12598 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12599 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
12600 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
12601 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
12602 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
12603 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
12604 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
12605 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
12606 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
12607}
12608
12609
12610IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12611{
12612 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12613 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12614 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12615 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12616}
12617
12618
12619IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12620{
12621 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12622 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12623 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12624 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12625 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
12626 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
12627 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
12628 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
12629}
12630
12631
12632/*
12633 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
12634 */
12635IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12636{
12637 RTUINT64U uSrc1 = { *puDst };
12638 RTUINT64U uSrc2 = { *puSrc };
12639 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12640
12641 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
12642 {
12643 if (uSrc2.ai8[i] < 0)
12644 uDst.ai8[i] = -uSrc1.ai8[i];
12645 else if (uSrc2.ai8[i] == 0)
12646 uDst.ai8[i] = 0;
12647 else /* uSrc2.ai8[i] > 0 */
12648 uDst.ai8[i] = uSrc1.ai8[i];
12649 }
12650
12651 *puDst = uDst.u;
12652 RT_NOREF(pFpuState);
12653}
12654
12655
12656IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12657{
12658 RTUINT128U uSrc1 = *puDst;
12659
12660 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12661 {
12662 if (puSrc->ai8[i] < 0)
12663 puDst->ai8[i] = -uSrc1.ai8[i];
12664 else if (puSrc->ai8[i] == 0)
12665 puDst->ai8[i] = 0;
12666 else /* puSrc->ai8[i] > 0 */
12667 puDst->ai8[i] = uSrc1.ai8[i];
12668 }
12669
12670 RT_NOREF(pFpuState);
12671}
12672
12673
12674IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12675{
12676 RTUINT64U uSrc1 = { *puDst };
12677 RTUINT64U uSrc2 = { *puSrc };
12678 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12679
12680 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
12681 {
12682 if (uSrc2.ai16[i] < 0)
12683 uDst.ai16[i] = -uSrc1.ai16[i];
12684 else if (uSrc2.ai16[i] == 0)
12685 uDst.ai16[i] = 0;
12686 else /* uSrc2.ai16[i] > 0 */
12687 uDst.ai16[i] = uSrc1.ai16[i];
12688 }
12689
12690 *puDst = uDst.u;
12691 RT_NOREF(pFpuState);
12692}
12693
12694
12695IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12696{
12697 RTUINT128U uSrc1 = *puDst;
12698
12699 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12700 {
12701 if (puSrc->ai16[i] < 0)
12702 puDst->ai16[i] = -uSrc1.ai16[i];
12703 else if (puSrc->ai16[i] == 0)
12704 puDst->ai16[i] = 0;
12705 else /* puSrc->ai16[i] > 0 */
12706 puDst->ai16[i] = uSrc1.ai16[i];
12707 }
12708
12709 RT_NOREF(pFpuState);
12710}
12711
12712
12713IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12714{
12715 RTUINT64U uSrc1 = { *puDst };
12716 RTUINT64U uSrc2 = { *puSrc };
12717 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12718
12719 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
12720 {
12721 if (uSrc2.ai32[i] < 0)
12722 uDst.ai32[i] = -uSrc1.ai32[i];
12723 else if (uSrc2.ai32[i] == 0)
12724 uDst.ai32[i] = 0;
12725 else /* uSrc2.ai32[i] > 0 */
12726 uDst.ai32[i] = uSrc1.ai32[i];
12727 }
12728
12729 *puDst = uDst.u;
12730 RT_NOREF(pFpuState);
12731}
12732
12733
12734IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12735{
12736 RTUINT128U uSrc1 = *puDst;
12737
12738 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12739 {
12740 if (puSrc->ai32[i] < 0)
12741 puDst->ai32[i] = -uSrc1.ai32[i];
12742 else if (puSrc->ai32[i] == 0)
12743 puDst->ai32[i] = 0;
12744 else /* puSrc->ai32[i] > 0 */
12745 puDst->ai32[i] = uSrc1.ai32[i];
12746 }
12747
12748 RT_NOREF(pFpuState);
12749}
12750
12751
12752IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12753{
12754 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12755 {
12756 if (puSrc2->ai8[i] < 0)
12757 puDst->ai8[i] = -puSrc1->ai8[i];
12758 else if (puSrc2->ai8[i] == 0)
12759 puDst->ai8[i] = 0;
12760 else /* puSrc2->ai8[i] > 0 */
12761 puDst->ai8[i] = puSrc1->ai8[i];
12762 }
12763}
12764
12765
12766IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12767{
12768 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12769 {
12770 if (puSrc2->ai8[i] < 0)
12771 puDst->ai8[i] = -puSrc1->ai8[i];
12772 else if (puSrc2->ai8[i] == 0)
12773 puDst->ai8[i] = 0;
12774 else /* puSrc2->ai8[i] > 0 */
12775 puDst->ai8[i] = puSrc1->ai8[i];
12776 }
12777}
12778
12779
12780IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12781{
12782 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12783 {
12784 if (puSrc2->ai16[i] < 0)
12785 puDst->ai16[i] = -puSrc1->ai16[i];
12786 else if (puSrc2->ai16[i] == 0)
12787 puDst->ai16[i] = 0;
12788 else /* puSrc2->ai16[i] > 0 */
12789 puDst->ai16[i] = puSrc1->ai16[i];
12790 }
12791}
12792
12793
12794IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12795{
12796 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12797 {
12798 if (puSrc2->ai16[i] < 0)
12799 puDst->ai16[i] = -puSrc1->ai16[i];
12800 else if (puSrc2->ai16[i] == 0)
12801 puDst->ai16[i] = 0;
12802 else /* puSrc2->ai16[i] > 0 */
12803 puDst->ai16[i] = puSrc1->ai16[i];
12804 }
12805}
12806
12807
12808IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12809{
12810 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12811 {
12812 if (puSrc2->ai32[i] < 0)
12813 puDst->ai32[i] = -puSrc1->ai32[i];
12814 else if (puSrc2->ai32[i] == 0)
12815 puDst->ai32[i] = 0;
12816 else /* puSrc2->ai32[i] > 0 */
12817 puDst->ai32[i] = puSrc1->ai32[i];
12818 }
12819}
12820
12821
12822IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12823{
12824 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12825 {
12826 if (puSrc2->ai32[i] < 0)
12827 puDst->ai32[i] = -puSrc1->ai32[i];
12828 else if (puSrc2->ai32[i] == 0)
12829 puDst->ai32[i] = 0;
12830 else /* puSrc2->ai32[i] > 0 */
12831 puDst->ai32[i] = puSrc1->ai32[i];
12832 }
12833}
12834
12835
12836/*
12837 * PHADDW / VPHADDW / PHADDD / VPHADDD
12838 */
12839IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12840{
12841 RTUINT64U uSrc1 = { *puDst };
12842 RTUINT64U uSrc2 = { *puSrc };
12843 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12844
12845 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
12846 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
12847 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
12848 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
12849 *puDst = uDst.u;
12850 RT_NOREF(pFpuState);
12851}
12852
12853
12854IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12855{
12856 RTUINT128U uSrc1 = *puDst;
12857
12858 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
12859 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
12860 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
12861 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
12862
12863 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
12864 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
12865 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
12866 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
12867 RT_NOREF(pFpuState);
12868}
12869
12870
12871IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12872{
12873 RTUINT64U uSrc1 = { *puDst };
12874 RTUINT64U uSrc2 = { *puSrc };
12875 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12876
12877 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
12878 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
12879 *puDst = uDst.u;
12880 RT_NOREF(pFpuState);
12881}
12882
12883
12884IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12885{
12886 RTUINT128U uSrc1 = *puDst;
12887
12888 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
12889 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
12890
12891 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
12892 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
12893 RT_NOREF(pFpuState);
12894}
12895
12896
12897IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12898{
12899 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12900
12901 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
12902 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
12903 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
12904 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
12905
12906 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
12907 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
12908 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
12909 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
12910
12911 puDst->au64[0] = uDst.au64[0];
12912 puDst->au64[1] = uDst.au64[1];
12913}
12914
12915
12916IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12917{
12918 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12919
12920 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
12921 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
12922 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
12923 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
12924 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
12925 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
12926 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
12927 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
12928
12929 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
12930 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
12931 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
12932 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
12933 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
12934 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
12935 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
12936 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
12937
12938 puDst->au64[0] = uDst.au64[0];
12939 puDst->au64[1] = uDst.au64[1];
12940 puDst->au64[2] = uDst.au64[2];
12941 puDst->au64[3] = uDst.au64[3];
12942}
12943
12944
12945IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12946{
12947 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12948
12949 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
12950 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
12951
12952 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
12953 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
12954
12955 puDst->au64[0] = uDst.au64[0];
12956 puDst->au64[1] = uDst.au64[1];
12957}
12958
12959
12960IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12961{
12962 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12963
12964 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
12965 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
12966 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
12967 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
12968
12969 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
12970 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
12971 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
12972 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
12973
12974 puDst->au64[0] = uDst.au64[0];
12975 puDst->au64[1] = uDst.au64[1];
12976 puDst->au64[2] = uDst.au64[2];
12977 puDst->au64[3] = uDst.au64[3];
12978}
12979
12980
12981/*
12982 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
12983 */
12984IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12985{
12986 RTUINT64U uSrc1 = { *puDst };
12987 RTUINT64U uSrc2 = { *puSrc };
12988 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12989
12990 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
12991 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
12992 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
12993 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
12994 *puDst = uDst.u;
12995 RT_NOREF(pFpuState);
12996}
12997
12998
12999IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13000{
13001 RTUINT128U uSrc1 = *puDst;
13002
13003 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13004 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13005 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13006 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13007
13008 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13009 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13010 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13011 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13012 RT_NOREF(pFpuState);
13013}
13014
13015
13016IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13017{
13018 RTUINT64U uSrc1 = { *puDst };
13019 RTUINT64U uSrc2 = { *puSrc };
13020 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13021
13022 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13023 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13024 *puDst = uDst.u;
13025 RT_NOREF(pFpuState);
13026}
13027
13028
13029IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13030{
13031 RTUINT128U uSrc1 = *puDst;
13032
13033 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13034 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13035
13036 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13037 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13038 RT_NOREF(pFpuState);
13039}
13040
13041
13042IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13043{
13044 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13045
13046 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13047 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13048 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13049 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13050
13051 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13052 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13053 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13054 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13055
13056 puDst->au64[0] = uDst.au64[0];
13057 puDst->au64[1] = uDst.au64[1];
13058}
13059
13060
13061IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13062{
13063 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13064
13065 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13066 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13067 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13068 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13069 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13070 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13071 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13072 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13073
13074 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13075 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13076 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13077 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13078 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13079 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13080 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13081 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13082
13083 puDst->au64[0] = uDst.au64[0];
13084 puDst->au64[1] = uDst.au64[1];
13085 puDst->au64[2] = uDst.au64[2];
13086 puDst->au64[3] = uDst.au64[3];
13087}
13088
13089
13090IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13091{
13092 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13093
13094 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13095 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13096
13097 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13098 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13099
13100 puDst->au64[0] = uDst.au64[0];
13101 puDst->au64[1] = uDst.au64[1];
13102}
13103
13104
13105IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13106{
13107 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13108
13109 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
13110 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
13111 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
13112 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
13113
13114 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
13115 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
13116 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
13117 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
13118
13119 puDst->au64[0] = uDst.au64[0];
13120 puDst->au64[1] = uDst.au64[1];
13121 puDst->au64[2] = uDst.au64[2];
13122 puDst->au64[3] = uDst.au64[3];
13123}
13124
13125
13126/*
13127 * PHADDSW / VPHADDSW
13128 */
13129IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13130{
13131 RTUINT64U uSrc1 = { *puDst };
13132 RTUINT64U uSrc2 = { *puSrc };
13133 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13134
13135 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13136 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13137 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
13138 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
13139 *puDst = uDst.u;
13140 RT_NOREF(pFpuState);
13141}
13142
13143
13144IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13145{
13146 RTUINT128U uSrc1 = *puDst;
13147
13148 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13149 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13150 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
13151 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
13152
13153 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
13154 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
13155 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
13156 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
13157 RT_NOREF(pFpuState);
13158}
13159
13160
13161IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13162{
13163 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13164
13165 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
13166 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
13167 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
13168 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
13169
13170 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
13171 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
13172 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
13173 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
13174
13175 puDst->au64[0] = uDst.au64[0];
13176 puDst->au64[1] = uDst.au64[1];
13177}
13178
13179
13180IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13181{
13182 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13183
13184 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
13185 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
13186 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
13187 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
13188 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
13189 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
13190 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
13191 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
13192
13193 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
13194 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
13195 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
13196 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
13197 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
13198 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
13199 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
13200 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
13201
13202 puDst->au64[0] = uDst.au64[0];
13203 puDst->au64[1] = uDst.au64[1];
13204 puDst->au64[2] = uDst.au64[2];
13205 puDst->au64[3] = uDst.au64[3];
13206}
13207
13208
13209/*
13210 * PHSUBSW / VPHSUBSW
13211 */
13212IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13213{
13214 RTUINT64U uSrc1 = { *puDst };
13215 RTUINT64U uSrc2 = { *puSrc };
13216 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13217
13218 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13219 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13220 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
13221 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
13222 *puDst = uDst.u;
13223 RT_NOREF(pFpuState);
13224}
13225
13226
13227IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13228{
13229 RTUINT128U uSrc1 = *puDst;
13230
13231 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13232 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13233 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
13234 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
13235
13236 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
13237 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
13238 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
13239 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
13240 RT_NOREF(pFpuState);
13241}
13242
13243
13244IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13245{
13246 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13247
13248 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
13249 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
13250 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
13251 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
13252
13253 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
13254 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
13255 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
13256 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
13257
13258 puDst->au64[0] = uDst.au64[0];
13259 puDst->au64[1] = uDst.au64[1];
13260}
13261
13262
13263IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13264{
13265 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13266
13267 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
13268 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
13269 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
13270 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
13271 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
13272 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
13273 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
13274 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
13275
13276 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
13277 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
13278 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
13279 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
13280 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
13281 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
13282 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
13283 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
13284
13285 puDst->au64[0] = uDst.au64[0];
13286 puDst->au64[1] = uDst.au64[1];
13287 puDst->au64[2] = uDst.au64[2];
13288 puDst->au64[3] = uDst.au64[3];
13289}
13290
13291
13292/*
13293 * PMADDUBSW / VPMADDUBSW
13294 */
13295IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13296{
13297 RTUINT64U uSrc1 = { *puDst };
13298 RTUINT64U uSrc2 = { *puSrc };
13299 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13300
13301 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
13302 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
13303 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
13304 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
13305 *puDst = uDst.u;
13306 RT_NOREF(pFpuState);
13307}
13308
13309
13310IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13311{
13312 RTUINT128U uSrc1 = *puDst;
13313
13314 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
13315 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
13316 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
13317 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
13318 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
13319 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
13320 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
13321 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
13322 RT_NOREF(pFpuState);
13323}
13324
13325
13326IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13327{
13328 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13329
13330 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13331 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13332 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13333 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13334 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13335 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13336 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13337 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13338
13339 puDst->au64[0] = uDst.au64[0];
13340 puDst->au64[1] = uDst.au64[1];
13341}
13342
13343
13344IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13345{
13346 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13347
13348 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13349 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13350 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13351 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13352 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13353 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13354 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13355 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13356 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
13357 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
13358 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
13359 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
13360 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
13361 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
13362 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
13363 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
13364
13365 puDst->au64[0] = uDst.au64[0];
13366 puDst->au64[1] = uDst.au64[1];
13367 puDst->au64[2] = uDst.au64[2];
13368 puDst->au64[3] = uDst.au64[3];
13369}
13370
13371
13372/*
13373 * PMULHRSW / VPMULHRSW
13374 */
/** Computes one PMULHRSW result word: the first operand is widened to int32_t
 *  and multiplied with the second, the product is shifted right by 14, one is
 *  added for rounding, the sum is shifted right once more and the outcome is
 *  truncated to uint16_t. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ( (uint16_t)( ( ( ((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1 ) >> 1 ) )
13377
13378IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13379{
13380 RTUINT64U uSrc1 = { *puDst };
13381 RTUINT64U uSrc2 = { *puSrc };
13382 RTUINT64U uDst;
13383
13384 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
13385 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
13386 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
13387 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
13388 *puDst = uDst.u;
13389 RT_NOREF(pFpuState);
13390}
13391
13392
13393IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13394{
13395 RTUINT128U uSrc1 = *puDst;
13396
13397 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
13398 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
13399 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
13400 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
13401 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
13402 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
13403 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
13404 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
13405 RT_NOREF(pFpuState);
13406}
13407
13408
13409IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13410{
13411 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13412
13413 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
13414 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
13415 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
13416 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
13417 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
13418 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
13419 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
13420 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
13421
13422 puDst->au64[0] = uDst.au64[0];
13423 puDst->au64[1] = uDst.au64[1];
13424}
13425
13426
13427IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13428{
13429 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13430
13431 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13432 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13433 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13434 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13435 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13436 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13437 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13438 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13439 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13440 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13441 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13442 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13443 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13444 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
13445 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
13446 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
13447
13448 puDst->au64[0] = uDst.au64[0];
13449 puDst->au64[1] = uDst.au64[1];
13450 puDst->au64[2] = uDst.au64[2];
13451 puDst->au64[3] = uDst.au64[3];
13452}
13453
13454
13455/*
13456 * PSADBW / VPSADBW
13457 */
13458#ifdef IEM_WITHOUT_ASSEMBLY
13459
13460IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13461{
13462 RTUINT64U uSrc1 = { *puDst };
13463 RTUINT64U uSrc2 = { *puSrc };
13464 RTUINT64U uDst;
13465 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13466 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13467 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13468 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13469 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13470 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13471 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13472 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13473
13474 uDst.au64[0] = 0;
13475 uDst.au16[0] = uSum;
13476 *puDst = uDst.u;
13477}
13478
13479
13480IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13481{
13482 RTUINT128U uSrc1 = *puDst;
13483
13484 puDst->au64[0] = 0;
13485 puDst->au64[1] = 0;
13486
13487 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
13488 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
13489 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
13490 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
13491 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
13492 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
13493 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
13494 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
13495 puDst->au16[0] = uSum;
13496
13497 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
13498 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
13499 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
13500 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
13501 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
13502 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
13503 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
13504 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
13505 puDst->au16[4] = uSum;
13506}
13507
13508#endif
13509
13510IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13511{
13512 RTUINT128U uSrc1 = *puSrc1;
13513 RTUINT128U uSrc2 = *puSrc2;
13514
13515 puDst->au64[0] = 0;
13516 puDst->au64[1] = 0;
13517
13518 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
13519 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13520 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13521 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13522 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13523 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13524 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13525 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13526 puDst->au16[0] = uSum;
13527
13528 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13529 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13530 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13531 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13532 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13533 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13534 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13535 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13536 puDst->au16[4] = uSum;
13537}
13538
13539IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13540{
13541 RTUINT256U uSrc1 = *puSrc1;
13542 RTUINT256U uSrc2 = *puSrc2;
13543
13544 puDst->au64[0] = 0;
13545 puDst->au64[1] = 0;
13546 puDst->au64[2] = 0;
13547 puDst->au64[3] = 0;
13548
13549 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13550 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13551 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13552 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13553 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13554 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13555 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13556 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13557 puDst->au16[0] = uSum;
13558
13559 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13560 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13561 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13562 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13563 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13564 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13565 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13566 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13567 puDst->au16[4] = uSum;
13568
13569 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
13570 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
13571 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
13572 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
13573 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
13574 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
13575 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
13576 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
13577 puDst->au16[8] = uSum;
13578
13579 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
13580 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
13581 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
13582 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
13583 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
13584 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
13585 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
13586 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
13587 puDst->au16[12] = uSum;
13588}
13589
13590
13591/*
13592 * PMULDQ / VPMULDQ
13593 */
13594IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13595{
13596 RTUINT128U uSrc1 = *puDst;
13597
13598 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
13599 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
13600}
13601
13602IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13603{
13604 RTUINT128U uSrc1 = *puSrc1;
13605 RTUINT128U uSrc2 = *puSrc2;
13606
13607 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13608 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13609}
13610
13611IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13612{
13613 RTUINT256U uSrc1 = *puSrc1;
13614 RTUINT256U uSrc2 = *puSrc2;
13615
13616 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13617 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13618 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
13619 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
13620}
13621
13622
13623/*
13624 * PMULUDQ / VPMULUDQ
13625 */
13626#ifdef IEM_WITHOUT_ASSEMBLY
13627
13628IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13629{
13630 RTUINT64U uSrc1 = { *puDst };
13631 RTUINT64U uSrc2 = { *puSrc };
13632 ASMCompilerBarrier();
13633 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13634 RT_NOREF(pFpuState);
13635}
13636
13637
13638IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13639{
13640 RTUINT128U uSrc1 = *puDst;
13641 RTUINT128U uSrc2 = *puSrc;
13642 ASMCompilerBarrier();
13643 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13644 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13645 RT_NOREF(pFpuState);
13646}
13647
13648#endif
13649
13650IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13651{
13652 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13653 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13654 ASMCompilerBarrier();
13655 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13656 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13657}
13658
13659
13660IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13661{
13662 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13663 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13664 ASMCompilerBarrier();
13665 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13666 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13667 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
13668 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
13669}
13670
13671
13672/*
13673 * UNPCKLPS / VUNPCKLPS
13674 */
13675#ifdef IEM_WITHOUT_ASSEMBLY
13676IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13677{
13678 RTUINT128U uSrc1 = *puDst;
13679 RTUINT128U uSrc2 = *puSrc;
13680 ASMCompilerBarrier();
13681 puDst->au32[0] = uSrc1.au32[0];
13682 puDst->au32[1] = uSrc2.au32[0];
13683 puDst->au32[2] = uSrc1.au32[1];
13684 puDst->au32[3] = uSrc2.au32[1];
13685}
13686
13687#endif
13688
13689IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13690{
13691 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13692 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13693 ASMCompilerBarrier();
13694 puDst->au32[0] = uSrc1.au32[0];
13695 puDst->au32[1] = uSrc2.au32[0];
13696 puDst->au32[2] = uSrc1.au32[1];
13697 puDst->au32[3] = uSrc2.au32[1];
13698}
13699
13700
13701IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13702{
13703 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13704 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13705 ASMCompilerBarrier();
13706 puDst->au32[0] = uSrc1.au32[0];
13707 puDst->au32[1] = uSrc2.au32[0];
13708 puDst->au32[2] = uSrc1.au32[1];
13709 puDst->au32[3] = uSrc2.au32[1];
13710
13711 puDst->au32[4] = uSrc1.au32[4];
13712 puDst->au32[5] = uSrc2.au32[4];
13713 puDst->au32[6] = uSrc1.au32[5];
13714 puDst->au32[7] = uSrc2.au32[5];
13715}
13716
13717
13718/*
13719 * UNPCKLPD / VUNPCKLPD
13720 */
13721#ifdef IEM_WITHOUT_ASSEMBLY
13722IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13723{
13724 RTUINT128U uSrc1 = *puDst;
13725 RTUINT128U uSrc2 = *puSrc;
13726 ASMCompilerBarrier();
13727 puDst->au64[0] = uSrc1.au64[0];
13728 puDst->au64[1] = uSrc2.au64[0];
13729}
13730
13731#endif
13732
13733IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13734{
13735 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13736 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13737 ASMCompilerBarrier();
13738 puDst->au64[0] = uSrc1.au64[0];
13739 puDst->au64[1] = uSrc2.au64[0];
13740}
13741
13742
13743IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13744{
13745 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13746 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13747 ASMCompilerBarrier();
13748 puDst->au64[0] = uSrc1.au64[0];
13749 puDst->au64[1] = uSrc2.au64[0];
13750 puDst->au64[2] = uSrc1.au64[2];
13751 puDst->au64[3] = uSrc2.au64[2];
13752}
13753
13754
13755/*
13756 * UNPCKHPS / VUNPCKHPS
13757 */
13758#ifdef IEM_WITHOUT_ASSEMBLY
13759IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13760{
13761 RTUINT128U uSrc1 = *puDst;
13762 RTUINT128U uSrc2 = *puSrc;
13763 ASMCompilerBarrier();
13764 puDst->au32[0] = uSrc1.au32[2];
13765 puDst->au32[1] = uSrc2.au32[2];
13766 puDst->au32[2] = uSrc1.au32[3];
13767 puDst->au32[3] = uSrc2.au32[3];
13768}
13769
13770#endif
13771
13772IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13773{
13774 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13775 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13776 ASMCompilerBarrier();
13777 puDst->au32[0] = uSrc1.au32[2];
13778 puDst->au32[1] = uSrc2.au32[2];
13779 puDst->au32[2] = uSrc1.au32[3];
13780 puDst->au32[3] = uSrc2.au32[3];
13781}
13782
13783
13784IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13785{
13786 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13787 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13788 ASMCompilerBarrier();
13789 puDst->au32[0] = uSrc1.au32[2];
13790 puDst->au32[1] = uSrc2.au32[2];
13791 puDst->au32[2] = uSrc1.au32[3];
13792 puDst->au32[3] = uSrc2.au32[3];
13793
13794 puDst->au32[4] = uSrc1.au32[6];
13795 puDst->au32[5] = uSrc2.au32[6];
13796 puDst->au32[6] = uSrc1.au32[7];
13797 puDst->au32[7] = uSrc2.au32[7];
13798}
13799
13800
13801/*
13802 * UNPCKHPD / VUNPCKHPD
13803 */
13804#ifdef IEM_WITHOUT_ASSEMBLY
13805IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13806{
13807 RTUINT128U uSrc1 = *puDst;
13808 RTUINT128U uSrc2 = *puSrc;
13809 ASMCompilerBarrier();
13810 puDst->au64[0] = uSrc1.au64[1];
13811 puDst->au64[1] = uSrc2.au64[1];
13812}
13813
13814#endif
13815
13816IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13817{
13818 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13819 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13820 ASMCompilerBarrier();
13821 puDst->au64[0] = uSrc1.au64[1];
13822 puDst->au64[1] = uSrc2.au64[1];
13823}
13824
13825
13826IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13827{
13828 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13829 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13830 ASMCompilerBarrier();
13831 puDst->au64[0] = uSrc1.au64[1];
13832 puDst->au64[1] = uSrc2.au64[1];
13833 puDst->au64[2] = uSrc1.au64[3];
13834 puDst->au64[3] = uSrc2.au64[3];
13835}
13836
13837
/*
 * CRC32 (SSE 4.2).
 */
13841
13842IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
13843{
13844 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13845}
13846
13847
13848IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
13849{
13850 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13851}
13852
13853IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
13854{
13855 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13856}
13857
13858IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
13859{
13860 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13861}
13862
13863
/*
 * PTEST (SSE 4.1) - special as it only outputs EFLAGS.
 */
13867#ifdef IEM_WITHOUT_ASSEMBLY
13868IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
13869{
13870 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
13871 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
13872 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
13873 fEfl |= X86_EFL_ZF;
13874 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
13875 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
13876 fEfl |= X86_EFL_CF;
13877 *pfEFlags = fEfl;
13878}
13879#endif
13880
13881IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
13882{
13883 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
13884 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
13885 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
13886 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
13887 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
13888 fEfl |= X86_EFL_ZF;
13889 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
13890 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
13891 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
13892 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
13893 fEfl |= X86_EFL_CF;
13894 *pfEFlags = fEfl;
13895}
13896
13897
13898/*
13899 * PMOVSXBW / VPMOVSXBW
13900 */
13901IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
13902{
13903 RTUINT64U uSrc1 = { uSrc };
13904 puDst->ai16[0] = uSrc1.ai8[0];
13905 puDst->ai16[1] = uSrc1.ai8[1];
13906 puDst->ai16[2] = uSrc1.ai8[2];
13907 puDst->ai16[3] = uSrc1.ai8[3];
13908 puDst->ai16[4] = uSrc1.ai8[4];
13909 puDst->ai16[5] = uSrc1.ai8[5];
13910 puDst->ai16[6] = uSrc1.ai8[6];
13911 puDst->ai16[7] = uSrc1.ai8[7];
13912}
13913
13914
13915IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13916{
13917 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13918 puDst->ai16[ 0] = uSrc1.ai8[ 0];
13919 puDst->ai16[ 1] = uSrc1.ai8[ 1];
13920 puDst->ai16[ 2] = uSrc1.ai8[ 2];
13921 puDst->ai16[ 3] = uSrc1.ai8[ 3];
13922 puDst->ai16[ 4] = uSrc1.ai8[ 4];
13923 puDst->ai16[ 5] = uSrc1.ai8[ 5];
13924 puDst->ai16[ 6] = uSrc1.ai8[ 6];
13925 puDst->ai16[ 7] = uSrc1.ai8[ 7];
13926 puDst->ai16[ 8] = uSrc1.ai8[ 8];
13927 puDst->ai16[ 9] = uSrc1.ai8[ 9];
13928 puDst->ai16[10] = uSrc1.ai8[10];
13929 puDst->ai16[11] = uSrc1.ai8[11];
13930 puDst->ai16[12] = uSrc1.ai8[12];
13931 puDst->ai16[13] = uSrc1.ai8[13];
13932 puDst->ai16[14] = uSrc1.ai8[14];
13933 puDst->ai16[15] = uSrc1.ai8[15];
13934}
13935
13936
13937/*
13938 * PMOVSXBD / VPMOVSXBD
13939 */
13940IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
13941{
13942 RTUINT32U uSrc1 = { uSrc };
13943 puDst->ai32[0] = uSrc1.ai8[0];
13944 puDst->ai32[1] = uSrc1.ai8[1];
13945 puDst->ai32[2] = uSrc1.ai8[2];
13946 puDst->ai32[3] = uSrc1.ai8[3];
13947}
13948
13949
13950IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13951{
13952 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13953 puDst->ai32[0] = uSrc1.ai8[0];
13954 puDst->ai32[1] = uSrc1.ai8[1];
13955 puDst->ai32[2] = uSrc1.ai8[2];
13956 puDst->ai32[3] = uSrc1.ai8[3];
13957 puDst->ai32[4] = uSrc1.ai8[4];
13958 puDst->ai32[5] = uSrc1.ai8[5];
13959 puDst->ai32[6] = uSrc1.ai8[6];
13960 puDst->ai32[7] = uSrc1.ai8[7];
13961}
13962
13963
13964/*
13965 * PMOVSXBQ / VPMOVSXBQ
13966 */
13967IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
13968{
13969 RTUINT16U uSrc1 = { uSrc };
13970 puDst->ai64[0] = uSrc1.ai8[0];
13971 puDst->ai64[1] = uSrc1.ai8[1];
13972}
13973
13974
13975IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13976{
13977 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13978 puDst->ai64[0] = uSrc1.ai8[0];
13979 puDst->ai64[1] = uSrc1.ai8[1];
13980 puDst->ai64[2] = uSrc1.ai8[2];
13981 puDst->ai64[3] = uSrc1.ai8[3];
13982}
13983
13984
13985/*
13986 * PMOVSXWD / VPMOVSXWD
13987 */
13988IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
13989{
13990 RTUINT64U uSrc1 = { uSrc };
13991 puDst->ai32[0] = uSrc1.ai16[0];
13992 puDst->ai32[1] = uSrc1.ai16[1];
13993 puDst->ai32[2] = uSrc1.ai16[2];
13994 puDst->ai32[3] = uSrc1.ai16[3];
13995}
13996
13997
13998IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13999{
14000 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14001 puDst->ai32[0] = uSrc1.ai16[0];
14002 puDst->ai32[1] = uSrc1.ai16[1];
14003 puDst->ai32[2] = uSrc1.ai16[2];
14004 puDst->ai32[3] = uSrc1.ai16[3];
14005 puDst->ai32[4] = uSrc1.ai16[4];
14006 puDst->ai32[5] = uSrc1.ai16[5];
14007 puDst->ai32[6] = uSrc1.ai16[6];
14008 puDst->ai32[7] = uSrc1.ai16[7];
14009}
14010
14011
14012/*
14013 * PMOVSXWQ / VPMOVSXWQ
14014 */
14015IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14016{
14017 RTUINT32U uSrc1 = { uSrc };
14018 puDst->ai64[0] = uSrc1.ai16[0];
14019 puDst->ai64[1] = uSrc1.ai16[1];
14020}
14021
14022
14023IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14024{
14025 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14026 puDst->ai64[0] = uSrc1.ai16[0];
14027 puDst->ai64[1] = uSrc1.ai16[1];
14028 puDst->ai64[2] = uSrc1.ai16[2];
14029 puDst->ai64[3] = uSrc1.ai16[3];
14030}
14031
14032
14033/*
14034 * PMOVSXDQ / VPMOVSXDQ
14035 */
14036IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14037{
14038 RTUINT64U uSrc1 = { uSrc };
14039 puDst->ai64[0] = uSrc1.ai32[0];
14040 puDst->ai64[1] = uSrc1.ai32[1];
14041}
14042
14043
14044IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14045{
14046 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14047 puDst->ai64[0] = uSrc1.ai32[0];
14048 puDst->ai64[1] = uSrc1.ai32[1];
14049 puDst->ai64[2] = uSrc1.ai32[2];
14050 puDst->ai64[3] = uSrc1.ai32[3];
14051}
14052
14053
14054/*
14055 * PMOVZXBW / VPMOVZXBW
14056 */
14057IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14058{
14059 RTUINT64U uSrc1 = { uSrc };
14060 puDst->au16[0] = uSrc1.au8[0];
14061 puDst->au16[1] = uSrc1.au8[1];
14062 puDst->au16[2] = uSrc1.au8[2];
14063 puDst->au16[3] = uSrc1.au8[3];
14064 puDst->au16[4] = uSrc1.au8[4];
14065 puDst->au16[5] = uSrc1.au8[5];
14066 puDst->au16[6] = uSrc1.au8[6];
14067 puDst->au16[7] = uSrc1.au8[7];
14068}
14069
14070
14071IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14072{
14073 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14074 puDst->au16[ 0] = uSrc1.au8[ 0];
14075 puDst->au16[ 1] = uSrc1.au8[ 1];
14076 puDst->au16[ 2] = uSrc1.au8[ 2];
14077 puDst->au16[ 3] = uSrc1.au8[ 3];
14078 puDst->au16[ 4] = uSrc1.au8[ 4];
14079 puDst->au16[ 5] = uSrc1.au8[ 5];
14080 puDst->au16[ 6] = uSrc1.au8[ 6];
14081 puDst->au16[ 7] = uSrc1.au8[ 7];
14082 puDst->au16[ 8] = uSrc1.au8[ 8];
14083 puDst->au16[ 9] = uSrc1.au8[ 9];
14084 puDst->au16[10] = uSrc1.au8[10];
14085 puDst->au16[11] = uSrc1.au8[11];
14086 puDst->au16[12] = uSrc1.au8[12];
14087 puDst->au16[13] = uSrc1.au8[13];
14088 puDst->au16[14] = uSrc1.au8[14];
14089 puDst->au16[15] = uSrc1.au8[15];
14090}
14091
14092
14093/*
14094 * PMOVZXBD / VPMOVZXBD
14095 */
14096IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14097{
14098 RTUINT32U uSrc1 = { uSrc };
14099 puDst->au32[0] = uSrc1.au8[0];
14100 puDst->au32[1] = uSrc1.au8[1];
14101 puDst->au32[2] = uSrc1.au8[2];
14102 puDst->au32[3] = uSrc1.au8[3];
14103}
14104
14105
14106IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14107{
14108 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14109 puDst->au32[0] = uSrc1.au8[0];
14110 puDst->au32[1] = uSrc1.au8[1];
14111 puDst->au32[2] = uSrc1.au8[2];
14112 puDst->au32[3] = uSrc1.au8[3];
14113 puDst->au32[4] = uSrc1.au8[4];
14114 puDst->au32[5] = uSrc1.au8[5];
14115 puDst->au32[6] = uSrc1.au8[6];
14116 puDst->au32[7] = uSrc1.au8[7];
14117}
14118
14119
14120/*
14121 * PMOVZXBQ / VPMOVZXBQ
14122 */
14123IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14124{
14125 RTUINT16U uSrc1 = { uSrc };
14126 puDst->au64[0] = uSrc1.au8[0];
14127 puDst->au64[1] = uSrc1.au8[1];
14128}
14129
14130
14131IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14132{
14133 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14134 puDst->au64[0] = uSrc1.au8[0];
14135 puDst->au64[1] = uSrc1.au8[1];
14136 puDst->au64[2] = uSrc1.au8[2];
14137 puDst->au64[3] = uSrc1.au8[3];
14138}
14139
14140
14141/*
14142 * PMOVZXWD / VPMOVZXWD
14143 */
14144IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14145{
14146 RTUINT64U uSrc1 = { uSrc };
14147 puDst->au32[0] = uSrc1.au16[0];
14148 puDst->au32[1] = uSrc1.au16[1];
14149 puDst->au32[2] = uSrc1.au16[2];
14150 puDst->au32[3] = uSrc1.au16[3];
14151}
14152
14153
14154IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14155{
14156 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14157 puDst->au32[0] = uSrc1.au16[0];
14158 puDst->au32[1] = uSrc1.au16[1];
14159 puDst->au32[2] = uSrc1.au16[2];
14160 puDst->au32[3] = uSrc1.au16[3];
14161 puDst->au32[4] = uSrc1.au16[4];
14162 puDst->au32[5] = uSrc1.au16[5];
14163 puDst->au32[6] = uSrc1.au16[6];
14164 puDst->au32[7] = uSrc1.au16[7];
14165}
14166
14167
14168/*
14169 * PMOVZXWQ / VPMOVZXWQ
14170 */
14171IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14172{
14173 RTUINT32U uSrc1 = { uSrc };
14174 puDst->au64[0] = uSrc1.au16[0];
14175 puDst->au64[1] = uSrc1.au16[1];
14176}
14177
14178
14179IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14180{
14181 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14182 puDst->au64[0] = uSrc1.au16[0];
14183 puDst->au64[1] = uSrc1.au16[1];
14184 puDst->au64[2] = uSrc1.au16[2];
14185 puDst->au64[3] = uSrc1.au16[3];
14186}
14187
14188
14189/*
14190 * PMOVZXDQ / VPMOVZXDQ
14191 */
14192IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14193{
14194 RTUINT64U uSrc1 = { uSrc };
14195 puDst->au64[0] = uSrc1.au32[0];
14196 puDst->au64[1] = uSrc1.au32[1];
14197}
14198
14199
14200IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14201{
14202 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14203 puDst->au64[0] = uSrc1.au32[0];
14204 puDst->au64[1] = uSrc1.au32[1];
14205 puDst->au64[2] = uSrc1.au32[2];
14206 puDst->au64[3] = uSrc1.au32[3];
14207}
14208
14209
14210#ifdef IEM_WITHOUT_ASSEMBLY
14211/**
14212 * Converts from the packed IPRT 32-bit (single precision) floating point format to
14213 * the SoftFloat 32-bit floating point format (float32_t).
14214 *
14215 * This is only a structure format conversion, nothing else.
14216 */
14217DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
14218{
14219 float32_t Tmp;
14220 Tmp.v = pr32Val->u;
14221 return Tmp;
14222}
14223
14224
14225/**
14226 * Converts from SoftFloat 32-bit floating point format (float32_t)
14227 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
14228 *
14229 * This is only a structure format conversion, nothing else.
14230 */
14231DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
14232{
14233 pr32Dst->u = r32XSrc.v;
14234 return pr32Dst;
14235}
14236
14237
14238/**
14239 * Converts from the packed IPRT 64-bit (single precision) floating point format to
14240 * the SoftFloat 64-bit floating point format (float64_t).
14241 *
14242 * This is only a structure format conversion, nothing else.
14243 */
14244DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
14245{
14246 float64_t Tmp;
14247 Tmp.v = pr64Val->u;
14248 return Tmp;
14249}
14250
14251
14252/**
14253 * Converts from SoftFloat 64-bit floating point format (float64_t)
14254 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
14255 *
14256 * This is only a structure format conversion, nothing else.
14257 */
14258DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
14259{
14260 pr64Dst->u = r64XSrc.v;
14261 return pr64Dst;
14262}
14263
14264
/**
 * Initializer for the SoftFloat state structure, derived from an MXCSR value.
 *
 * The initializer is positional: tininess detection, rounding mode (mapped
 * from the MXCSR rounding control field), a zero field (presumably the initial
 * exception flags - confirm against softfloat_state_t), the exception mask
 * bits taken from MXCSR, and the rounding precision.
 *
 * @param   a_Mxcsr     The MXCSR value to derive the state from (evaluated
 *                      multiple times).
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        /* Tininess detection: */ \
        softfloat_tininess_afterRounding, \
        /* Rounding mode, everything not matched explicitly ends up as round-towards-zero: */ \
          ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                           (uint8_t)softfloat_round_minMag, \
        0, \
        /* Exception mask, matches X86_FSW_?E: */ \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), \
        /* Rounding precision, not relevant for SIMD: */ \
        32 \
    }
14277
14278
14279/**
14280 * Helper for transfering exception to MXCSR and setting the result value
14281 * accordingly.
14282 *
14283 * @returns Updated MXCSR.
14284 * @param pSoftState The SoftFloat state following the operation.
14285 * @param r32Result The result of the SoftFloat operation.
14286 * @param pr32Result Where to store the result for IEM.
14287 * @param fMxcsr The original MXCSR value.
14288 */
14289DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
14290 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14291{
14292 iemFpSoftF32ToIprt(pr32Result, r32Result);
14293
14294 uint8_t fXcpt = pSoftState->exceptionFlags;
14295 if ( (fMxcsr & X86_MXCSR_FZ)
14296 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
14297 {
14298 /* Underflow masked and flush to zero is set. */
14299 pr32Result->s.uFraction = 0;
14300 pr32Result->s.uExponent = 0;
14301 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14302 }
14303
14304 /* If DAZ is set \#DE is never set. */
14305 if ( fMxcsr & X86_MXCSR_DAZ
14306 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14307 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14308 fXcpt &= ~X86_MXCSR_DE;
14309
14310 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14311}
14312
14313
14314/**
14315 * Helper for transfering exception to MXCSR and setting the result value
14316 * accordingly - ignores Flush-to-Zero.
14317 *
14318 * @returns Updated MXCSR.
14319 * @param pSoftState The SoftFloat state following the operation.
14320 * @param r32Result The result of the SoftFloat operation.
14321 * @param pr32Result Where to store the result for IEM.
14322 * @param fMxcsr The original MXCSR value.
14323 */
14324DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
14325 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14326{
14327 iemFpSoftF32ToIprt(pr32Result, r32Result);
14328
14329 uint8_t fXcpt = pSoftState->exceptionFlags;
14330 /* If DAZ is set \#DE is never set. */
14331 if ( fMxcsr & X86_MXCSR_DAZ
14332 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14333 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14334 fXcpt &= ~X86_MXCSR_DE;
14335
14336 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14337}
14338
14339
14340/**
14341 * Helper for transfering exception to MXCSR and setting the result value
14342 * accordingly.
14343 *
14344 * @returns Updated MXCSR.
14345 * @param pSoftState The SoftFloat state following the operation.
14346 * @param r64Result The result of the SoftFloat operation.
14347 * @param pr64Result Where to store the result for IEM.
14348 * @param fMxcsr The original MXCSR value.
14349 */
14350DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
14351 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14352{
14353 iemFpSoftF64ToIprt(pr64Result, r64Result);
14354 uint8_t fXcpt = pSoftState->exceptionFlags;
14355 if ( (fMxcsr & X86_MXCSR_FZ)
14356 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
14357 {
14358 /* Underflow masked and flush to zero is set. */
14359 iemFpSoftF64ToIprt(pr64Result, r64Result);
14360 pr64Result->s.uFractionHigh = 0;
14361 pr64Result->s.uFractionLow = 0;
14362 pr64Result->s.uExponent = 0;
14363 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14364 }
14365
14366 /* If DAZ is set \#DE is never set. */
14367 if ( fMxcsr & X86_MXCSR_DAZ
14368 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14369 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14370 fXcpt &= ~X86_MXCSR_DE;
14371
14372 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14373}
14374
14375
14376/**
14377 * Helper for transfering exception to MXCSR and setting the result value
14378 * accordingly - ignores Flush-to-Zero.
14379 *
14380 * @returns Updated MXCSR.
14381 * @param pSoftState The SoftFloat state following the operation.
14382 * @param r64Result The result of the SoftFloat operation.
14383 * @param pr64Result Where to store the result for IEM.
14384 * @param fMxcsr The original MXCSR value.
14385 */
14386DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
14387 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14388{
14389 iemFpSoftF64ToIprt(pr64Result, r64Result);
14390
14391 uint8_t fXcpt = pSoftState->exceptionFlags;
14392 /* If DAZ is set \#DE is never set. */
14393 if ( fMxcsr & X86_MXCSR_DAZ
14394 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14395 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14396 fXcpt &= ~X86_MXCSR_DE;
14397
14398 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14399}
14400
14401
14402/**
14403 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
14404 * in MXCSR into account.
14405 *
14406 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14407 * @param pr32Val Where to store the result.
14408 * @param fMxcsr The input MXCSR value.
14409 * @param pr32Src The value to use.
14410 */
14411DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
14412{
14413 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
14414 {
14415 if (fMxcsr & X86_MXCSR_DAZ)
14416 {
14417 /* De-normals are changed to 0. */
14418 pr32Val->s.fSign = pr32Src->s.fSign;
14419 pr32Val->s.uFraction = 0;
14420 pr32Val->s.uExponent = 0;
14421 return 0;
14422 }
14423
14424 *pr32Val = *pr32Src;
14425 return X86_MXCSR_DE;
14426 }
14427
14428 *pr32Val = *pr32Src;
14429 return 0;
14430}
14431
14432
14433/**
14434 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
14435 * in MXCSR into account.
14436 *
14437 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14438 * @param pr64Val Where to store the result.
14439 * @param fMxcsr The input MXCSR value.
14440 * @param pr64Src The value to use.
14441 */
14442DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
14443{
14444 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
14445 {
14446 if (fMxcsr & X86_MXCSR_DAZ)
14447 {
14448 /* De-normals are changed to 0. */
14449 pr64Val->s64.fSign = pr64Src->s.fSign;
14450 pr64Val->s64.uFraction = 0;
14451 pr64Val->s64.uExponent = 0;
14452 return 0;
14453 }
14454
14455 *pr64Val = *pr64Src;
14456 return X86_MXCSR_DE;
14457 }
14458
14459 *pr64Val = *pr64Src;
14460 return 0;
14461}
14462
14463
14464/**
14465 * Validates the given input operands returning whether the operation can continue or whether one
14466 * of the source operands contains a NaN value, setting the output accordingly.
14467 *
14468 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14469 * @param pr32Res Where to store the result in case the operation can't continue.
14470 * @param pr32Val1 The first input operand.
14471 * @param pr32Val2 The second input operand.
14472 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14473 */
14474DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
14475{
14476 uint8_t cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
14477 uint8_t cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
14478 if (cSNan + cQNan == 2)
14479 {
14480 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14481 *pr32Res = *pr32Val1;
14482 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14483 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14484 return true;
14485 }
14486 else if (cSNan)
14487 {
14488 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14489 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14490 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14491 *pfMxcsr |= X86_MXCSR_IE;
14492 return true;
14493 }
14494 else if (cQNan)
14495 {
14496 /* The QNan operand is placed into the result. */
14497 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14498 return true;
14499 }
14500
14501 Assert(!cQNan && !cSNan);
14502 return false;
14503}
14504
14505
14506/**
14507 * Validates the given double precision input operands returning whether the operation can continue or whether one
14508 * of the source operands contains a NaN value, setting the output accordingly.
14509 *
14510 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14511 * @param pr64Res Where to store the result in case the operation can't continue.
14512 * @param pr64Val1 The first input operand.
14513 * @param pr64Val2 The second input operand.
14514 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14515 */
14516DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
14517{
14518 uint8_t cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
14519 uint8_t cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
14520 if (cSNan + cQNan == 2)
14521 {
14522 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14523 *pr64Res = *pr64Val1;
14524 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14525 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14526 return true;
14527 }
14528 else if (cSNan)
14529 {
14530 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14531 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14532 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14533 *pfMxcsr |= X86_MXCSR_IE;
14534 return true;
14535 }
14536 else if (cQNan)
14537 {
14538 /* The QNan operand is placed into the result. */
14539 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14540 return true;
14541 }
14542
14543 Assert(!cQNan && !cSNan);
14544 return false;
14545}
14546
14547
14548/**
14549 * Validates the given single input operand returning whether the operation can continue or whether
14550 * contains a NaN value, setting the output accordingly.
14551 *
14552 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14553 * @param pr32Res Where to store the result in case the operation can't continue.
14554 * @param pr32Val The input operand.
14555 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14556 */
14557DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
14558{
14559 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
14560 {
14561 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14562 *pr32Res = *pr32Val;
14563 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14564 *pfMxcsr |= X86_MXCSR_IE;
14565 return true;
14566 }
14567 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
14568 {
14569 /* The QNan operand is placed into the result. */
14570 *pr32Res = *pr32Val;
14571 return true;
14572 }
14573
14574 return false;
14575}
14576
14577
14578/**
14579 * Validates the given double input operand returning whether the operation can continue or whether
14580 * contains a NaN value, setting the output accordingly.
14581 *
14582 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14583 * @param pr64Res Where to store the result in case the operation can't continue.
14584 * @param pr64Val The input operand.
14585 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14586 */
14587DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
14588{
14589 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
14590 {
14591 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14592 *pr64Res = *pr64Val;
14593 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14594 *pfMxcsr |= X86_MXCSR_IE;
14595 return true;
14596 }
14597 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
14598 {
14599 /* The QNan operand is placed into the result. */
14600 *pr64Res = *pr64Val;
14601 return true;
14602 }
14603
14604 return false;
14605}
14606#endif
14607
14608
14609/**
14610 * ADDPS
14611 */
14612#ifdef IEM_WITHOUT_ASSEMBLY
14613static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14614{
14615 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14616 return fMxcsr;
14617
14618 RTFLOAT32U r32Src1, r32Src2;
14619 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14620 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14621 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14622 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14623 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14624}
14625
14626
14627IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14628{
14629 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14630 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14631 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14632 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14633}
14634#endif
14635
14636
14637/**
14638 * ADDSS
14639 */
14640#ifdef IEM_WITHOUT_ASSEMBLY
14641IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14642{
14643 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14644 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14645 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14646 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14647}
14648#endif
14649
14650
14651/**
14652 * ADDPD
14653 */
14654#ifdef IEM_WITHOUT_ASSEMBLY
14655static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14656{
14657 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14658 return fMxcsr;
14659
14660 RTFLOAT64U r64Src1, r64Src2;
14661 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14662 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14663 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14664 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14665 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14666}
14667
14668
14669IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14670{
14671 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14672 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14673}
14674#endif
14675
14676
14677/**
14678 * ADDSD
14679 */
14680#ifdef IEM_WITHOUT_ASSEMBLY
14681IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14682{
14683 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14684 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14685}
14686#endif
14687
14688
14689/**
14690 * MULPS
14691 */
14692#ifdef IEM_WITHOUT_ASSEMBLY
14693static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14694{
14695 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14696 return fMxcsr;
14697
14698 RTFLOAT32U r32Src1, r32Src2;
14699 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14700 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14701 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14702 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14703 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14704}
14705
14706
14707IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14708{
14709 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14710 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14711 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14712 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14713}
14714#endif
14715
14716
14717/**
14718 * MULSS
14719 */
14720#ifdef IEM_WITHOUT_ASSEMBLY
14721IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14722{
14723 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14724 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14725 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14726 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14727}
14728#endif
14729
14730
14731/**
14732 * MULPD
14733 */
14734#ifdef IEM_WITHOUT_ASSEMBLY
14735static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14736{
14737 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14738 return fMxcsr;
14739
14740 RTFLOAT64U r64Src1, r64Src2;
14741 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14742 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14743 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14744 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14745 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14746}
14747
14748
14749IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14750{
14751 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14752 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14753}
14754#endif
14755
14756
14757/**
14758 * MULSD
14759 */
14760#ifdef IEM_WITHOUT_ASSEMBLY
14761IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14762{
14763 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14764 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14765}
14766#endif
14767
14768
14769/**
14770 * SUBPS
14771 */
14772#ifdef IEM_WITHOUT_ASSEMBLY
14773static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14774{
14775 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14776 return fMxcsr;
14777
14778 RTFLOAT32U r32Src1, r32Src2;
14779 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14780 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14781 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14782 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14783 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14784}
14785
14786
14787IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14788{
14789 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14790 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14791 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14792 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14793}
14794#endif
14795
14796
14797/**
14798 * SUBSS
14799 */
14800#ifdef IEM_WITHOUT_ASSEMBLY
14801IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14802{
14803 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14804 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14805 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14806 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14807}
14808#endif
14809
14810
14811/**
14812 * SUBPD
14813 */
14814#ifdef IEM_WITHOUT_ASSEMBLY
14815static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14816{
14817 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14818 return fMxcsr;
14819
14820 RTFLOAT64U r64Src1, r64Src2;
14821 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14822 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14823 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14824 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14825 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14826}
14827
14828
14829IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14830{
14831 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14832 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14833}
14834#endif
14835
14836
14837/**
14838 * SUBSD
14839 */
14840#ifdef IEM_WITHOUT_ASSEMBLY
14841IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14842{
14843 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14844 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14845}
14846#endif
14847
14848
14849/**
14850 * MINPS
14851 */
14852#ifdef IEM_WITHOUT_ASSEMBLY
14853static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14854{
14855 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
14856 {
14857 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
14858 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
14859 return fMxcsr | X86_MXCSR_IE;
14860 }
14861
14862 RTFLOAT32U r32Src1, r32Src2;
14863 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14864 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14865 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
14866 {
14867 *pr32Res = r32Src2;
14868 return fMxcsr;
14869 }
14870
14871 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14872 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14873 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
14874 fLe
14875 ? iemFpSoftF32FromIprt(&r32Src1)
14876 : iemFpSoftF32FromIprt(&r32Src2),
14877 pr32Res, fMxcsr);
14878}
14879
14880
14881IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14882{
14883 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14884 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14885 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14886 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14887}
14888#endif
14889
14890
14891/**
14892 * MINSS
14893 */
14894#ifdef IEM_WITHOUT_ASSEMBLY
14895IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14896{
14897 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14898 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14899 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14900 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14901}
14902#endif
14903
14904
14905/**
14906 * MINPD
14907 */
14908#ifdef IEM_WITHOUT_ASSEMBLY
14909static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14910{
14911 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
14912 {
14913 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
14914 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
14915 return fMxcsr | X86_MXCSR_IE;
14916 }
14917
14918 RTFLOAT64U r64Src1, r64Src2;
14919 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14920 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14921 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
14922 {
14923 *pr64Res = r64Src2;
14924 return fMxcsr;
14925 }
14926
14927 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14928 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14929 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
14930 fLe
14931 ? iemFpSoftF64FromIprt(&r64Src1)
14932 : iemFpSoftF64FromIprt(&r64Src2),
14933 pr64Res, fMxcsr);
14934}
14935
14936
14937IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14938{
14939 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14940 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14941}
14942#endif
14943
14944
14945/**
14946 * MINSD
14947 */
14948#ifdef IEM_WITHOUT_ASSEMBLY
14949IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14950{
14951 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14952 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14953}
14954#endif
14955
14956
14957/**
14958 * DIVPS
14959 */
14960#ifdef IEM_WITHOUT_ASSEMBLY
14961static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14962{
14963 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14964 return fMxcsr;
14965
14966 RTFLOAT32U r32Src1, r32Src2;
14967 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14968 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14969 if (RTFLOAT32U_IS_ZERO(&r32Src2))
14970 {
14971 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
14972 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
14973 {
14974 *pr32Res = g_ar32QNaN[1];
14975 return fMxcsr | X86_MXCSR_IE;
14976 }
14977 else if (RTFLOAT32U_IS_INF(&r32Src1))
14978 {
14979 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
14980 return fMxcsr;
14981 }
14982 else
14983 {
14984 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
14985 return fMxcsr | X86_MXCSR_ZE;
14986 }
14987 }
14988
14989 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14990 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14991 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
14992}
14993
14994
14995IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14996{
14997 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14998 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14999 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15000 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15001}
15002#endif
15003
15004
15005/**
15006 * DIVSS
15007 */
15008#ifdef IEM_WITHOUT_ASSEMBLY
15009IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15010{
15011 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15012 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15013 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15014 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15015}
15016#endif
15017
15018
15019/**
15020 * DIVPD
15021 */
15022#ifdef IEM_WITHOUT_ASSEMBLY
15023static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15024{
15025 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15026 return fMxcsr;
15027
15028 RTFLOAT64U r64Src1, r64Src2;
15029 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15030 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15031 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15032 {
15033 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15034 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15035 {
15036 *pr64Res = g_ar64QNaN[1];
15037 return fMxcsr | X86_MXCSR_IE;
15038 }
15039 else if (RTFLOAT64U_IS_INF(&r64Src1))
15040 {
15041 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15042 return fMxcsr;
15043 }
15044 else
15045 {
15046 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15047 return fMxcsr | X86_MXCSR_ZE;
15048 }
15049 }
15050
15051 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15052 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15053 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15054}
15055
15056
15057IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15058{
15059 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15060 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15061}
15062#endif
15063
15064
15065/**
15066 * DIVSD
15067 */
15068#ifdef IEM_WITHOUT_ASSEMBLY
15069IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15070{
15071 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15072 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15073}
15074#endif
15075
15076
15077/**
15078 * MAXPS
15079 */
15080#ifdef IEM_WITHOUT_ASSEMBLY
15081static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15082{
15083 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15084 {
15085 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15086 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15087 return fMxcsr | X86_MXCSR_IE;
15088 }
15089
15090 RTFLOAT32U r32Src1, r32Src2;
15091 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15092 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15093 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15094 {
15095 *pr32Res = r32Src2;
15096 return fMxcsr;
15097 }
15098
15099 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15100 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15101 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15102 fLe
15103 ? iemFpSoftF32FromIprt(&r32Src2)
15104 : iemFpSoftF32FromIprt(&r32Src1),
15105 pr32Res, fMxcsr);
15106}
15107
15108
15109IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15110{
15111 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15112 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15113 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15114 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15115}
15116#endif
15117
15118
15119/**
15120 * MAXSS
15121 */
15122#ifdef IEM_WITHOUT_ASSEMBLY
15123IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15124{
15125 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15126 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15127 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15128 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15129}
15130#endif
15131
15132
15133/**
15134 * MAXPD
15135 */
15136#ifdef IEM_WITHOUT_ASSEMBLY
15137static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15138{
15139 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15140 {
15141 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15142 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15143 return fMxcsr | X86_MXCSR_IE;
15144 }
15145
15146 RTFLOAT64U r64Src1, r64Src2;
15147 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15148 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15149 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15150 {
15151 *pr64Res = r64Src2;
15152 return fMxcsr;
15153 }
15154
15155 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15156 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15157 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15158 fLe
15159 ? iemFpSoftF64FromIprt(&r64Src2)
15160 : iemFpSoftF64FromIprt(&r64Src1),
15161 pr64Res, fMxcsr);
15162}
15163
15164
15165IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15166{
15167 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15168 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15169}
15170#endif
15171
15172
15173/**
15174 * MAXSD
15175 */
15176#ifdef IEM_WITHOUT_ASSEMBLY
15177IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15178{
15179 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15180 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15181}
15182#endif
15183
15184
15185/**
15186 * CVTSS2SD
15187 */
15188#ifdef IEM_WITHOUT_ASSEMBLY
15189static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15190{
15191 RTFLOAT32U r32Src1;
15192 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15193
15194 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15195 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15196 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15197}
15198
15199
15200IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15201{
15202 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
15203 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15204}
15205#endif
15206
15207
15208/**
15209 * CVTSD2SS
15210 */
15211#ifdef IEM_WITHOUT_ASSEMBLY
15212static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15213{
15214 RTFLOAT64U r64Src1;
15215 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15216
15217 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15218 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15219 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15220}
15221
15222
15223IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15224{
15225 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
15226 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15227 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15228 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15229}
15230#endif
15231
15232
15233/**
15234 * HADDPS
15235 */
15236#ifdef IEM_WITHOUT_ASSEMBLY
15237IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15238{
15239 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15240 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15241 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15242 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15243}
15244#endif
15245
15246
15247/**
15248 * HADDPD
15249 */
15250#ifdef IEM_WITHOUT_ASSEMBLY
15251IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15252{
15253 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15254 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15255}
15256#endif
15257
15258
15259/**
15260 * HSUBPS
15261 */
15262#ifdef IEM_WITHOUT_ASSEMBLY
15263IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15264{
15265 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15266 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15267 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15268 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15269}
15270#endif
15271
15272
15273/**
15274 * HSUBPD
15275 */
15276#ifdef IEM_WITHOUT_ASSEMBLY
15277IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15278{
15279 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15280 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15281}
15282#endif
15283
15284
15285/**
15286 * SQRTPS
15287 */
15288#ifdef IEM_WITHOUT_ASSEMBLY
15289static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15290{
15291 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15292 return fMxcsr;
15293
15294 RTFLOAT32U r32Src;
15295 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
15296 if (RTFLOAT32U_IS_ZERO(&r32Src))
15297 {
15298 *pr32Res = r32Src;
15299 return fMxcsr;
15300 }
15301 else if (r32Src.s.fSign)
15302 {
15303 *pr32Res = g_ar32QNaN[1];
15304 return fMxcsr | X86_MXCSR_IE;
15305 }
15306
15307 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15308 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15309 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15310}
15311
15312
15313IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15314{
15315 RT_NOREF(puSrc1);
15316
15317 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15318 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15319 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15320 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15321}
15322#endif
15323
15324
15325/**
15326 * SQRTSS
15327 */
15328#ifdef IEM_WITHOUT_ASSEMBLY
15329IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15330{
15331 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15332 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15333 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15334 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15335}
15336#endif
15337
15338
15339/**
15340 * SQRTPD
15341 */
15342#ifdef IEM_WITHOUT_ASSEMBLY
15343static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
15344{
15345 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
15346 return fMxcsr;
15347
15348 RTFLOAT64U r64Src;
15349 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
15350 if (RTFLOAT64U_IS_ZERO(&r64Src))
15351 {
15352 *pr64Res = r64Src;
15353 return fMxcsr;
15354 }
15355 else if (r64Src.s.fSign)
15356 {
15357 *pr64Res = g_ar64QNaN[1];
15358 return fMxcsr | X86_MXCSR_IE;
15359 }
15360
15361 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15362 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
15363 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15364}
15365
15366
15367IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15368{
15369 RT_NOREF(puSrc1);
15370
15371 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15372 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15373}
15374#endif
15375
15376
15377/**
15378 * SQRTSD
15379 */
15380#ifdef IEM_WITHOUT_ASSEMBLY
15381IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15382{
15383 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
15384 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15385}
15386#endif
15387
15388
15389/**
15390 * ADDSUBPS
15391 */
15392#ifdef IEM_WITHOUT_ASSEMBLY
15393IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15394{
15395 RT_NOREF(puSrc1);
15396
15397 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15398 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15399 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15400 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15401}
15402#endif
15403
15404
15405/**
15406 * ADDSUBPD
15407 */
15408#ifdef IEM_WITHOUT_ASSEMBLY
15409IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15410{
15411 RT_NOREF(puSrc1);
15412
15413 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15414 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15415}
15416#endif
15417
15418
15419/**
15420 * CVTPD2PS
15421 */
15422#ifdef IEM_WITHOUT_ASSEMBLY
15423static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15424{
15425 RTFLOAT64U r64Src1;
15426 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15427
15428 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15429 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15430 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15431}
15432
15433
15434IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15435{
15436 RT_NOREF(puSrc1);
15437
15438 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15439 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15440 pResult->uResult.au32[2] = 0;
15441 pResult->uResult.au32[3] = 0;
15442}
15443#endif
15444
15445
15446/**
15447 * CVTPS2PD
15448 */
15449#ifdef IEM_WITHOUT_ASSEMBLY
15450static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15451{
15452 RTFLOAT32U r32Src1;
15453 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15454
15455 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15456 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15457 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15458}
15459
15460
15461IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15462{
15463 RT_NOREF(puSrc1);
15464
15465 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15466 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15467}
15468#endif
15469
15470
15471/**
15472 * [V]SHUFPS
15473 */
15474#ifdef IEM_WITHOUT_ASSEMBLY
15475IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15476{
15477 RTUINT128U const uSrc1 = *puDst;
15478 RTUINT128U const uSrc2 = *puSrc;
15479 ASMCompilerBarrier();
15480 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15481 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15482 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15483 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15484}
15485#endif
15486
15487
15488IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15489{
15490 RTUINT128U const uSrc1 = *puSrc1;
15491 RTUINT128U const uSrc2 = *puSrc2;
15492 ASMCompilerBarrier();
15493 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15494 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15495 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15496 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15497}
15498
15499
15500IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15501{
15502 RTUINT256U const uSrc1 = *puSrc1;
15503 RTUINT256U const uSrc2 = *puSrc2;
15504 ASMCompilerBarrier();
15505 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15506 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15507 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15508 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15509
15510 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
15511 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
15512 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
15513 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
15514}
15515
15516
15517/**
15518 * [V]SHUFPD
15519 */
15520#ifdef IEM_WITHOUT_ASSEMBLY
15521IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15522{
15523 RTUINT128U const uSrc1 = *puDst;
15524 RTUINT128U const uSrc2 = *puSrc;
15525 ASMCompilerBarrier();
15526 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15527 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15528}
15529#endif
15530
15531
15532IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15533{
15534 RTUINT128U const uSrc1 = *puSrc1;
15535 RTUINT128U const uSrc2 = *puSrc2;
15536 ASMCompilerBarrier();
15537 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15538 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15539}
15540
15541
15542IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15543{
15544 RTUINT256U const uSrc1 = *puSrc1;
15545 RTUINT256U const uSrc2 = *puSrc2;
15546 ASMCompilerBarrier();
15547 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15548 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15549 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
15550 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
15551}
15552
15553
15554/*
15555 * PHMINPOSUW / VPHMINPOSUW
15556 */
15557IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15558{
15559 uint16_t u16Min = puSrc->au16[0];
15560 uint8_t idxMin = 0;
15561
15562 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
15563 if (puSrc->au16[i] < u16Min)
15564 {
15565 u16Min = puSrc->au16[i];
15566 idxMin = i;
15567 }
15568
15569 puDst->au64[0] = 0;
15570 puDst->au64[1] = 0;
15571 puDst->au16[0] = u16Min;
15572 puDst->au16[1] = idxMin;
15573}
15574
15575
15576IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15577{
15578 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
15579}
15580
15581
15582/*
15583 * [V]PBLENDVB
15584 */
15585IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15586{
15587 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15588 if (puMask->au8[i] & RT_BIT(7))
15589 puDst->au8[i] = puSrc->au8[i];
15590}
15591
15592
15593IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15594{
15595 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15596 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15597}
15598
15599
15600IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15601{
15602 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15603 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15604}
15605
15606
15607/*
15608 * [V]BLENDVPS
15609 */
15610IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15611{
15612 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15613 if (puMask->au32[i] & RT_BIT_32(31))
15614 puDst->au32[i] = puSrc->au32[i];
15615}
15616
15617
15618IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15619{
15620 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15621 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
15622}
15623
15624
15625IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15626{
15627 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15628 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
15629}
15630
15631
15632/*
15633 * [V]BLENDVPD
15634 */
15635IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15636{
15637 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
15638 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
15639}
15640
15641
15642IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15643{
15644 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15645 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
15646}
15647
15648
15649IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15650{
15651 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15652 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
15653}
15654
15655
15656/**
15657 * [V]PALIGNR
15658 */
15659IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
15660{
15661 uint64_t const u64Src1 = *pu64Dst;
15662 ASMCompilerBarrier();
15663
15664 if (bEvil >= 16)
15665 *pu64Dst = 0;
15666 else if (bEvil >= 8)
15667 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
15668 else
15669 {
15670 uint8_t cShift = bEvil * 8;
15671 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
15672 | (u64Src2 >> cShift);
15673 }
15674}
15675
15676
15677IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15678{
15679 RTUINT128U const uSrc1 = *puDst;
15680 RTUINT128U const uSrc2 = *puSrc;
15681 ASMCompilerBarrier();
15682
15683 puDst->au64[0] = 0;
15684 puDst->au64[1] = 0;
15685 if (bEvil >= 32)
15686 { /* Everything stays 0. */ }
15687 else if (bEvil >= 16)
15688 {
15689 bEvil -= 16;
15690 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
15691 puDst->au8[i - bEvil] = uSrc1.au8[i];
15692 }
15693 else
15694 {
15695 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
15696 puDst->au8[i] = uSrc2.au8[i + bEvil];
15697 for (uint8_t i = 0; i < bEvil; i++)
15698 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
15699 }
15700}
15701
15702
15703IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15704{
15705 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
15706 RTUINT128U const uSrc2 = *puSrc2;
15707 ASMCompilerBarrier();
15708
15709 puDst->au64[0] = 0;
15710 puDst->au64[1] = 0;
15711 if (bEvil >= 32)
15712 { /* Everything stays 0. */ }
15713 else if (bEvil >= 16)
15714 {
15715 bEvil -= 16;
15716 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
15717 puDst->au8[i - bEvil] = uSrc1.au8[i];
15718 }
15719 else
15720 {
15721 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
15722 puDst->au8[i] = uSrc2.au8[i + bEvil];
15723 for (uint8_t i = 0; i < bEvil; i++)
15724 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
15725 }
15726}
15727
15728
15729IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15730{
15731 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
15732 RTUINT256U const uSrc2 = *puSrc2;
15733 ASMCompilerBarrier();
15734
15735 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
15736 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
15737}
15738
15739
15740/**
15741 * [V]PBLENDW
15742 */
15743IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15744{
15745 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
15746 if (bEvil & RT_BIT(i))
15747 puDst->au16[i] = puSrc->au16[i];
15748}
15749
15750
15751IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15752{
15753 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
15754 if (bEvil & RT_BIT(i))
15755 puDst->au16[i] = puSrc2->au16[i];
15756 else
15757 puDst->au16[i] = puSrc1->au16[i];
15758}
15759
15760
15761IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15762{
15763 for (uint8_t i = 0; i < 8; i++)
15764 if (bEvil & RT_BIT(i))
15765 {
15766 puDst->au16[ i] = puSrc2->au16[ i];
15767 puDst->au16[8 + i] = puSrc2->au16[8 + i];
15768 }
15769 else
15770 {
15771 puDst->au16[ i] = puSrc1->au16[ i];
15772 puDst->au16[8 + i] = puSrc1->au16[8 + i];
15773 }
15774}
15775
15776
15777/**
15778 * [V]BLENDPS
15779 */
15780IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15781{
15782 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15783 if (bEvil & RT_BIT(i))
15784 puDst->au32[i] = puSrc->au32[i];
15785}
15786
15787
15788IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15789{
15790 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15791 if (bEvil & RT_BIT(i))
15792 puDst->au32[i] = puSrc2->au32[i];
15793 else
15794 puDst->au32[i] = puSrc1->au32[i];
15795}
15796
15797
15798IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15799{
15800 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15801 if (bEvil & RT_BIT(i))
15802 puDst->au32[i] = puSrc2->au32[i];
15803 else
15804 puDst->au32[i] = puSrc1->au32[i];
15805}
15806
15807
15808/**
15809 * [V]BLENDPD
15810 */
15811IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15812{
15813 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15814 if (bEvil & RT_BIT(i))
15815 puDst->au64[i] = puSrc->au64[i];
15816}
15817
15818
15819IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15820{
15821 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15822 if (bEvil & RT_BIT(i))
15823 puDst->au64[i] = puSrc2->au64[i];
15824 else
15825 puDst->au64[i] = puSrc1->au64[i];
15826}
15827
15828
15829IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15830{
15831 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15832 if (bEvil & RT_BIT(i))
15833 puDst->au64[i] = puSrc2->au64[i];
15834 else
15835 puDst->au64[i] = puSrc1->au64[i];
15836}
15837
15838
15839/**
15840 * [V]PCMPISTRI
15841 */
15842IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRISRC pSrc, uint8_t bEvil))
15843{
15844 RT_NOREF(pu32Ecx, pEFlags, pSrc, bEvil);
15845 AssertReleaseFailed();
15846}
15847
15848
15849/*
15850 * [V]PCLMULQDQ
15851 */
15852IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15853{
15854 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
15855}
15856
15857
15858IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15859{
15860 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
15861 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
15862
15863 puDst->au64[0] = 0;
15864 puDst->au64[1] = 0;
15865
15866 /*
15867 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
15868 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
15869 * and squeeze out some optimizations.
15870 */
15871 if (uSrc1 & 0x1)
15872 puDst->au64[0] = uSrc2;
15873
15874 uSrc1 >>= 1;
15875
15876 uint8_t iDigit = 1;
15877 while (uSrc1)
15878 {
15879 if (uSrc1 & 0x1)
15880 {
15881 puDst->au64[0] ^= (uSrc2 << iDigit);
15882 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
15883 }
15884
15885 uSrc1 >>= 1;
15886 iDigit++;
15887 }
15888}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette