VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 96109

Last change on this file since 96109 was 96109, checked in by vboxsync, 2 years ago

VMM/IEM: Implement [v]unpck{l,h}p{s,d} instructions, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 515.5 KB
Line 
/* $Id: IEMAllAImplC.cpp 96109 2022-08-08 11:41:33Z vboxsync $ */
/** @file
 * IEM - Instruction Implementation in Assembly, portable C variant.
 */

/*
 * Copyright (C) 2011-2022 Oracle Corporation
 *
 * This file is part of VirtualBox Open Source Edition (OSE), as
 * available from http://www.virtualbox.org. This file is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License (GPL) as published by the Free Software
 * Foundation, in version 2 as it comes in the "COPYING" file of the
 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
 */
17
18
19/*********************************************************************************************************************************
20* Header Files *
21*********************************************************************************************************************************/
22#include "IEMInternal.h"
23#include <VBox/vmm/vmcc.h>
24#include <iprt/errcore.h>
25#include <iprt/x86.h>
26#include <iprt/uint128.h>
27#include <iprt/uint256.h>
28#include <iprt/crc.h>
29
30RT_C_DECLS_BEGIN
31#include <softfloat.h>
32RT_C_DECLS_END
33
34
35/*********************************************************************************************************************************
36* Defined Constants And Macros *
37*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless the build already defines it, it is defined here automatically when
 * targeting ARM32/ARM64 or while Doxygen is running.  The conditional
 * sections further down test it to decide whether the C implementations are
 * compiled.
 */
#if !defined(IEM_WITHOUT_ASSEMBLY)
# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
#  define IEM_WITHOUT_ASSEMBLY
# endif
#endif
/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#ifdef IEM_WITH_ASSEMBLY
# undef IEM_WITHOUT_ASSEMBLY
#endif
50
/**
 * Calculates the sign flag value given a result and its bit width.
 *
 * The sign flag (SF) is a duplication of the most significant bit in the
 * result, so the result is shifted right until that bit lines up with
 * X86_EFL_SF and everything else is masked off.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
63
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) indicates whether the result is zero or not: the 0/1
 * outcome of the comparison is shifted up to the ZF bit position.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
74
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * Each variant moves the most significant bit of a value of the given width
 * to the X86_EFL_OF position and masks off the rest.  These are typically
 * used by concatenating the macro name with a bit count.  The problem is that
 * 8-bit values need shifting in the other direction (left) than the wider
 * ones (right).
 *
 * @returns X86_EFL_OF or 0.
 * @param   a_uValue        The OF calculation result; only its top bit (for
 *                          the given width) matters.
 */
#define X86_EFL_GET_OF_8(a_uValue)   (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_16(a_uValue)  ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_32(a_uValue)  ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_64(a_uValue)  ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
85
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
 *
 * This is a statement macro; it yields no value.  All bits in
 * X86_EFL_STATUS_BITS are cleared and recalculated, the remaining bits of
 * the EFLAGS value are left as they were, and the outcome is written back
 * through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be
 *                          a plain literal since it is token-pasted into a
 *                          type name and a macro name.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same signed \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the signed bit value must differ between \
           the two inputs and the result's signed bit diff from the first input. \
           Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64((a_cBitsWidth) - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
122
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations. AF on the other hand is
 * undefined.  We do not set AF, as that seems to make the most sense (which
 * probably makes it the most wrong in real life).
 *
 * This is a statement macro; it yields no value.  All bits in
 * X86_EFL_STATUS_BITS are cleared first, then PF, ZF and SF are derived from
 * the result and @a a_fExtra is OR'ed in.  The outcome is written back
 * through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
146
147
148/*********************************************************************************************************************************
149* Global Variables *
150*********************************************************************************************************************************/
151/**
152 * Parity calculation table.
153 *
154 * This is also used by iemAllAImpl.asm.
155 *
156 * The generator code:
157 * @code
158 * #include <stdio.h>
159 *
160 * int main()
161 * {
162 * unsigned b;
163 * for (b = 0; b < 256; b++)
164 * {
165 * int cOnes = ( b & 1)
166 * + ((b >> 1) & 1)
167 * + ((b >> 2) & 1)
168 * + ((b >> 3) & 1)
169 * + ((b >> 4) & 1)
170 * + ((b >> 5) & 1)
171 * + ((b >> 6) & 1)
172 * + ((b >> 7) & 1);
173 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
174 * b,
175 * (b >> 7) & 1,
176 * (b >> 6) & 1,
177 * (b >> 5) & 1,
178 * (b >> 4) & 1,
179 * (b >> 3) & 1,
180 * (b >> 2) & 1,
181 * (b >> 1) & 1,
182 * b & 1,
183 * cOnes & 1 ? "0" : "X86_EFL_PF");
184 * }
185 * return 0;
186 * }
187 * @endcode
188 */
189uint8_t const g_afParity[256] =
190{
191 /* 0000 = 00000000b */ X86_EFL_PF,
192 /* 0x01 = 00000001b */ 0,
193 /* 0x02 = 00000010b */ 0,
194 /* 0x03 = 00000011b */ X86_EFL_PF,
195 /* 0x04 = 00000100b */ 0,
196 /* 0x05 = 00000101b */ X86_EFL_PF,
197 /* 0x06 = 00000110b */ X86_EFL_PF,
198 /* 0x07 = 00000111b */ 0,
199 /* 0x08 = 00001000b */ 0,
200 /* 0x09 = 00001001b */ X86_EFL_PF,
201 /* 0x0a = 00001010b */ X86_EFL_PF,
202 /* 0x0b = 00001011b */ 0,
203 /* 0x0c = 00001100b */ X86_EFL_PF,
204 /* 0x0d = 00001101b */ 0,
205 /* 0x0e = 00001110b */ 0,
206 /* 0x0f = 00001111b */ X86_EFL_PF,
207 /* 0x10 = 00010000b */ 0,
208 /* 0x11 = 00010001b */ X86_EFL_PF,
209 /* 0x12 = 00010010b */ X86_EFL_PF,
210 /* 0x13 = 00010011b */ 0,
211 /* 0x14 = 00010100b */ X86_EFL_PF,
212 /* 0x15 = 00010101b */ 0,
213 /* 0x16 = 00010110b */ 0,
214 /* 0x17 = 00010111b */ X86_EFL_PF,
215 /* 0x18 = 00011000b */ X86_EFL_PF,
216 /* 0x19 = 00011001b */ 0,
217 /* 0x1a = 00011010b */ 0,
218 /* 0x1b = 00011011b */ X86_EFL_PF,
219 /* 0x1c = 00011100b */ 0,
220 /* 0x1d = 00011101b */ X86_EFL_PF,
221 /* 0x1e = 00011110b */ X86_EFL_PF,
222 /* 0x1f = 00011111b */ 0,
223 /* 0x20 = 00100000b */ 0,
224 /* 0x21 = 00100001b */ X86_EFL_PF,
225 /* 0x22 = 00100010b */ X86_EFL_PF,
226 /* 0x23 = 00100011b */ 0,
227 /* 0x24 = 00100100b */ X86_EFL_PF,
228 /* 0x25 = 00100101b */ 0,
229 /* 0x26 = 00100110b */ 0,
230 /* 0x27 = 00100111b */ X86_EFL_PF,
231 /* 0x28 = 00101000b */ X86_EFL_PF,
232 /* 0x29 = 00101001b */ 0,
233 /* 0x2a = 00101010b */ 0,
234 /* 0x2b = 00101011b */ X86_EFL_PF,
235 /* 0x2c = 00101100b */ 0,
236 /* 0x2d = 00101101b */ X86_EFL_PF,
237 /* 0x2e = 00101110b */ X86_EFL_PF,
238 /* 0x2f = 00101111b */ 0,
239 /* 0x30 = 00110000b */ X86_EFL_PF,
240 /* 0x31 = 00110001b */ 0,
241 /* 0x32 = 00110010b */ 0,
242 /* 0x33 = 00110011b */ X86_EFL_PF,
243 /* 0x34 = 00110100b */ 0,
244 /* 0x35 = 00110101b */ X86_EFL_PF,
245 /* 0x36 = 00110110b */ X86_EFL_PF,
246 /* 0x37 = 00110111b */ 0,
247 /* 0x38 = 00111000b */ 0,
248 /* 0x39 = 00111001b */ X86_EFL_PF,
249 /* 0x3a = 00111010b */ X86_EFL_PF,
250 /* 0x3b = 00111011b */ 0,
251 /* 0x3c = 00111100b */ X86_EFL_PF,
252 /* 0x3d = 00111101b */ 0,
253 /* 0x3e = 00111110b */ 0,
254 /* 0x3f = 00111111b */ X86_EFL_PF,
255 /* 0x40 = 01000000b */ 0,
256 /* 0x41 = 01000001b */ X86_EFL_PF,
257 /* 0x42 = 01000010b */ X86_EFL_PF,
258 /* 0x43 = 01000011b */ 0,
259 /* 0x44 = 01000100b */ X86_EFL_PF,
260 /* 0x45 = 01000101b */ 0,
261 /* 0x46 = 01000110b */ 0,
262 /* 0x47 = 01000111b */ X86_EFL_PF,
263 /* 0x48 = 01001000b */ X86_EFL_PF,
264 /* 0x49 = 01001001b */ 0,
265 /* 0x4a = 01001010b */ 0,
266 /* 0x4b = 01001011b */ X86_EFL_PF,
267 /* 0x4c = 01001100b */ 0,
268 /* 0x4d = 01001101b */ X86_EFL_PF,
269 /* 0x4e = 01001110b */ X86_EFL_PF,
270 /* 0x4f = 01001111b */ 0,
271 /* 0x50 = 01010000b */ X86_EFL_PF,
272 /* 0x51 = 01010001b */ 0,
273 /* 0x52 = 01010010b */ 0,
274 /* 0x53 = 01010011b */ X86_EFL_PF,
275 /* 0x54 = 01010100b */ 0,
276 /* 0x55 = 01010101b */ X86_EFL_PF,
277 /* 0x56 = 01010110b */ X86_EFL_PF,
278 /* 0x57 = 01010111b */ 0,
279 /* 0x58 = 01011000b */ 0,
280 /* 0x59 = 01011001b */ X86_EFL_PF,
281 /* 0x5a = 01011010b */ X86_EFL_PF,
282 /* 0x5b = 01011011b */ 0,
283 /* 0x5c = 01011100b */ X86_EFL_PF,
284 /* 0x5d = 01011101b */ 0,
285 /* 0x5e = 01011110b */ 0,
286 /* 0x5f = 01011111b */ X86_EFL_PF,
287 /* 0x60 = 01100000b */ X86_EFL_PF,
288 /* 0x61 = 01100001b */ 0,
289 /* 0x62 = 01100010b */ 0,
290 /* 0x63 = 01100011b */ X86_EFL_PF,
291 /* 0x64 = 01100100b */ 0,
292 /* 0x65 = 01100101b */ X86_EFL_PF,
293 /* 0x66 = 01100110b */ X86_EFL_PF,
294 /* 0x67 = 01100111b */ 0,
295 /* 0x68 = 01101000b */ 0,
296 /* 0x69 = 01101001b */ X86_EFL_PF,
297 /* 0x6a = 01101010b */ X86_EFL_PF,
298 /* 0x6b = 01101011b */ 0,
299 /* 0x6c = 01101100b */ X86_EFL_PF,
300 /* 0x6d = 01101101b */ 0,
301 /* 0x6e = 01101110b */ 0,
302 /* 0x6f = 01101111b */ X86_EFL_PF,
303 /* 0x70 = 01110000b */ 0,
304 /* 0x71 = 01110001b */ X86_EFL_PF,
305 /* 0x72 = 01110010b */ X86_EFL_PF,
306 /* 0x73 = 01110011b */ 0,
307 /* 0x74 = 01110100b */ X86_EFL_PF,
308 /* 0x75 = 01110101b */ 0,
309 /* 0x76 = 01110110b */ 0,
310 /* 0x77 = 01110111b */ X86_EFL_PF,
311 /* 0x78 = 01111000b */ X86_EFL_PF,
312 /* 0x79 = 01111001b */ 0,
313 /* 0x7a = 01111010b */ 0,
314 /* 0x7b = 01111011b */ X86_EFL_PF,
315 /* 0x7c = 01111100b */ 0,
316 /* 0x7d = 01111101b */ X86_EFL_PF,
317 /* 0x7e = 01111110b */ X86_EFL_PF,
318 /* 0x7f = 01111111b */ 0,
319 /* 0x80 = 10000000b */ 0,
320 /* 0x81 = 10000001b */ X86_EFL_PF,
321 /* 0x82 = 10000010b */ X86_EFL_PF,
322 /* 0x83 = 10000011b */ 0,
323 /* 0x84 = 10000100b */ X86_EFL_PF,
324 /* 0x85 = 10000101b */ 0,
325 /* 0x86 = 10000110b */ 0,
326 /* 0x87 = 10000111b */ X86_EFL_PF,
327 /* 0x88 = 10001000b */ X86_EFL_PF,
328 /* 0x89 = 10001001b */ 0,
329 /* 0x8a = 10001010b */ 0,
330 /* 0x8b = 10001011b */ X86_EFL_PF,
331 /* 0x8c = 10001100b */ 0,
332 /* 0x8d = 10001101b */ X86_EFL_PF,
333 /* 0x8e = 10001110b */ X86_EFL_PF,
334 /* 0x8f = 10001111b */ 0,
335 /* 0x90 = 10010000b */ X86_EFL_PF,
336 /* 0x91 = 10010001b */ 0,
337 /* 0x92 = 10010010b */ 0,
338 /* 0x93 = 10010011b */ X86_EFL_PF,
339 /* 0x94 = 10010100b */ 0,
340 /* 0x95 = 10010101b */ X86_EFL_PF,
341 /* 0x96 = 10010110b */ X86_EFL_PF,
342 /* 0x97 = 10010111b */ 0,
343 /* 0x98 = 10011000b */ 0,
344 /* 0x99 = 10011001b */ X86_EFL_PF,
345 /* 0x9a = 10011010b */ X86_EFL_PF,
346 /* 0x9b = 10011011b */ 0,
347 /* 0x9c = 10011100b */ X86_EFL_PF,
348 /* 0x9d = 10011101b */ 0,
349 /* 0x9e = 10011110b */ 0,
350 /* 0x9f = 10011111b */ X86_EFL_PF,
351 /* 0xa0 = 10100000b */ X86_EFL_PF,
352 /* 0xa1 = 10100001b */ 0,
353 /* 0xa2 = 10100010b */ 0,
354 /* 0xa3 = 10100011b */ X86_EFL_PF,
355 /* 0xa4 = 10100100b */ 0,
356 /* 0xa5 = 10100101b */ X86_EFL_PF,
357 /* 0xa6 = 10100110b */ X86_EFL_PF,
358 /* 0xa7 = 10100111b */ 0,
359 /* 0xa8 = 10101000b */ 0,
360 /* 0xa9 = 10101001b */ X86_EFL_PF,
361 /* 0xaa = 10101010b */ X86_EFL_PF,
362 /* 0xab = 10101011b */ 0,
363 /* 0xac = 10101100b */ X86_EFL_PF,
364 /* 0xad = 10101101b */ 0,
365 /* 0xae = 10101110b */ 0,
366 /* 0xaf = 10101111b */ X86_EFL_PF,
367 /* 0xb0 = 10110000b */ 0,
368 /* 0xb1 = 10110001b */ X86_EFL_PF,
369 /* 0xb2 = 10110010b */ X86_EFL_PF,
370 /* 0xb3 = 10110011b */ 0,
371 /* 0xb4 = 10110100b */ X86_EFL_PF,
372 /* 0xb5 = 10110101b */ 0,
373 /* 0xb6 = 10110110b */ 0,
374 /* 0xb7 = 10110111b */ X86_EFL_PF,
375 /* 0xb8 = 10111000b */ X86_EFL_PF,
376 /* 0xb9 = 10111001b */ 0,
377 /* 0xba = 10111010b */ 0,
378 /* 0xbb = 10111011b */ X86_EFL_PF,
379 /* 0xbc = 10111100b */ 0,
380 /* 0xbd = 10111101b */ X86_EFL_PF,
381 /* 0xbe = 10111110b */ X86_EFL_PF,
382 /* 0xbf = 10111111b */ 0,
383 /* 0xc0 = 11000000b */ X86_EFL_PF,
384 /* 0xc1 = 11000001b */ 0,
385 /* 0xc2 = 11000010b */ 0,
386 /* 0xc3 = 11000011b */ X86_EFL_PF,
387 /* 0xc4 = 11000100b */ 0,
388 /* 0xc5 = 11000101b */ X86_EFL_PF,
389 /* 0xc6 = 11000110b */ X86_EFL_PF,
390 /* 0xc7 = 11000111b */ 0,
391 /* 0xc8 = 11001000b */ 0,
392 /* 0xc9 = 11001001b */ X86_EFL_PF,
393 /* 0xca = 11001010b */ X86_EFL_PF,
394 /* 0xcb = 11001011b */ 0,
395 /* 0xcc = 11001100b */ X86_EFL_PF,
396 /* 0xcd = 11001101b */ 0,
397 /* 0xce = 11001110b */ 0,
398 /* 0xcf = 11001111b */ X86_EFL_PF,
399 /* 0xd0 = 11010000b */ 0,
400 /* 0xd1 = 11010001b */ X86_EFL_PF,
401 /* 0xd2 = 11010010b */ X86_EFL_PF,
402 /* 0xd3 = 11010011b */ 0,
403 /* 0xd4 = 11010100b */ X86_EFL_PF,
404 /* 0xd5 = 11010101b */ 0,
405 /* 0xd6 = 11010110b */ 0,
406 /* 0xd7 = 11010111b */ X86_EFL_PF,
407 /* 0xd8 = 11011000b */ X86_EFL_PF,
408 /* 0xd9 = 11011001b */ 0,
409 /* 0xda = 11011010b */ 0,
410 /* 0xdb = 11011011b */ X86_EFL_PF,
411 /* 0xdc = 11011100b */ 0,
412 /* 0xdd = 11011101b */ X86_EFL_PF,
413 /* 0xde = 11011110b */ X86_EFL_PF,
414 /* 0xdf = 11011111b */ 0,
415 /* 0xe0 = 11100000b */ 0,
416 /* 0xe1 = 11100001b */ X86_EFL_PF,
417 /* 0xe2 = 11100010b */ X86_EFL_PF,
418 /* 0xe3 = 11100011b */ 0,
419 /* 0xe4 = 11100100b */ X86_EFL_PF,
420 /* 0xe5 = 11100101b */ 0,
421 /* 0xe6 = 11100110b */ 0,
422 /* 0xe7 = 11100111b */ X86_EFL_PF,
423 /* 0xe8 = 11101000b */ X86_EFL_PF,
424 /* 0xe9 = 11101001b */ 0,
425 /* 0xea = 11101010b */ 0,
426 /* 0xeb = 11101011b */ X86_EFL_PF,
427 /* 0xec = 11101100b */ 0,
428 /* 0xed = 11101101b */ X86_EFL_PF,
429 /* 0xee = 11101110b */ X86_EFL_PF,
430 /* 0xef = 11101111b */ 0,
431 /* 0xf0 = 11110000b */ X86_EFL_PF,
432 /* 0xf1 = 11110001b */ 0,
433 /* 0xf2 = 11110010b */ 0,
434 /* 0xf3 = 11110011b */ X86_EFL_PF,
435 /* 0xf4 = 11110100b */ 0,
436 /* 0xf5 = 11110101b */ X86_EFL_PF,
437 /* 0xf6 = 11110110b */ X86_EFL_PF,
438 /* 0xf7 = 11110111b */ 0,
439 /* 0xf8 = 11111000b */ 0,
440 /* 0xf9 = 11111001b */ X86_EFL_PF,
441 /* 0xfa = 11111010b */ X86_EFL_PF,
442 /* 0xfb = 11111011b */ 0,
443 /* 0xfc = 11111100b */ X86_EFL_PF,
444 /* 0xfd = 11111101b */ 0,
445 /* 0xfe = 11111110b */ 0,
446 /* 0xff = 11111111b */ X86_EFL_PF,
447};
448
449/* for clang: */
450extern const RTFLOAT80U g_ar80Zero[];
451extern const RTFLOAT80U g_ar80One[];
452extern const RTFLOAT80U g_r80Indefinite;
453extern const RTFLOAT80U g_ar80Infinity[];
454extern const RTFLOAT128U g_r128Ln2;
455extern const RTUINT128U g_u128Ln2Mantissa;
456extern const RTUINT128U g_u128Ln2MantissaIntel;
457extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
458
459/** Zero values (indexed by fSign). */
460RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
461
462/** One values (indexed by fSign). */
463RTFLOAT80U const g_ar80One[] =
464{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
465
466/** Indefinite (negative). */
467RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
468
469/** Infinities (indexed by fSign). */
470RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
471
472#if 0
473/** 128-bit floating point constant: 2.0 */
474const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
475#endif
476
477
478/* The next section is generated by tools/IEMGenFpuConstants: */
479
480/** The ln2 constant as 128-bit floating point value.
481 * base-10: 6.93147180559945309417232121458176575e-1
482 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
483 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
484//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
485const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
486/** High precision ln2 value.
487 * base-10: 6.931471805599453094172321214581765680747e-1
488 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
489 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
490const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
491/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
492 * base-10: 6.931471805599453094151379470289064954613e-1
493 * base-16: b.17217f7d1cf79abc0000000000000000@-1
494 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
495const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
496
497/** Horner constants for f2xm1 */
498const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
499{
500 /* a0
501 * base-10: 1.00000000000000000000000000000000000e0
502 * base-16: 1.0000000000000000000000000000@0
503 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
504 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
505 /* a1
506 * base-10: 5.00000000000000000000000000000000000e-1
507 * base-16: 8.0000000000000000000000000000@-1
508 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
509 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
510 /* a2
511 * base-10: 1.66666666666666666666666666666666658e-1
512 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
513 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
514 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
515 /* a3
516 * base-10: 4.16666666666666666666666666666666646e-2
517 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
518 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
519 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
520 /* a4
521 * base-10: 8.33333333333333333333333333333333323e-3
522 * base-16: 2.2222222222222222222222222222@-2
523 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
524 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
525 /* a5
526 * base-10: 1.38888888888888888888888888888888874e-3
527 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
528 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
529 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
530 /* a6
531 * base-10: 1.98412698412698412698412698412698412e-4
532 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
533 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
534 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
535 /* a7
536 * base-10: 2.48015873015873015873015873015873015e-5
537 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
538 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
539 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
540 /* a8
541 * base-10: 2.75573192239858906525573192239858902e-6
542 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
543 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
544 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
545 /* a9
546 * base-10: 2.75573192239858906525573192239858865e-7
547 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
548 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
549 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
550 /* a10
551 * base-10: 2.50521083854417187750521083854417184e-8
552 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
553 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
554 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
555 /* a11
556 * base-10: 2.08767569878680989792100903212014296e-9
557 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
558 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
559 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
560 /* a12
561 * base-10: 1.60590438368216145993923771701549472e-10
562 * base-16: b.092309d43684be51c198e91d7b40@-9
563 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
564 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
565 /* a13
566 * base-10: 1.14707455977297247138516979786821043e-11
567 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
568 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
569 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
570 /* a14
571 * base-10: 7.64716373181981647590113198578806964e-13
572 * base-16: d.73f9f399dc0f88ec32b587746578@-11
573 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
574 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
575 /* a15
576 * base-10: 4.77947733238738529743820749111754352e-14
577 * base-16: d.73f9f399dc0f88ec32b587746578@-12
578 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
579 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
580 /* a16
581 * base-10: 2.81145725434552076319894558301031970e-15
582 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
583 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
584 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
585 /* a17
586 * base-10: 1.56192069685862264622163643500573321e-16
587 * base-16: b.413c31dcbecbbdd8024435161550@-14
588 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
589 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
590 /* a18
591 * base-10: 8.22063524662432971695598123687227980e-18
592 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
593 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
594 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
595 /* a19
596 * base-10: 4.11031762331216485847799061843614006e-19
597 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
598 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
599 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
600 /* a20
601 * base-10: 7.04351638180413298434020229233492164e-20
602 * base-16: 1.4c9ee35db1d1f3c946fdcd48fd88@-16
603 * base-2 : 1.0100110010011110111000110101110110110001110100011111001111001001010001101111110111001101010010001111110110001000e-64 */
604 RTFLOAT128U_INIT_C(0, 0x4c9ee35db1d1, 0xf3c946fdcd48fd88, 0x3fbf),
605 /* a21
606 * base-10: 5.81527769640186708776361513365257702e-20
607 * base-16: 1.129e64bff606a2b9c9fc624481cd@-16
608 * base-2 : 1.0001001010011110011001001011111111110110000001101010001010111001110010011111110001100010010001001000000111001101e-64 */
609 RTFLOAT128U_INIT_C(0, 0x129e64bff606, 0xa2b9c9fc624481cd, 0x3fbf),
610};
611
612
613/*
614 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
615 * it all in C is probably safer atm., optimize what's necessary later, maybe.
616 */
617#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
618
619
620/*********************************************************************************************************************************
621* Binary Operations *
622*********************************************************************************************************************************/
623
624/*
625 * ADD
626 */
627
628IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
629{
630 uint64_t uDst = *puDst;
631 uint64_t uResult = uDst + uSrc;
632 *puDst = uResult;
633 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
634}
635
636# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
637
638IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
639{
640 uint32_t uDst = *puDst;
641 uint32_t uResult = uDst + uSrc;
642 *puDst = uResult;
643 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
644}
645
646
647IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
648{
649 uint16_t uDst = *puDst;
650 uint16_t uResult = uDst + uSrc;
651 *puDst = uResult;
652 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
653}
654
655
656IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
657{
658 uint8_t uDst = *puDst;
659 uint8_t uResult = uDst + uSrc;
660 *puDst = uResult;
661 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
662}
663
664# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
665
666/*
667 * ADC
668 */
669
670IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
671{
672 if (!(*pfEFlags & X86_EFL_CF))
673 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
674 else
675 {
676 uint64_t uDst = *puDst;
677 uint64_t uResult = uDst + uSrc + 1;
678 *puDst = uResult;
679 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
680 }
681}
682
683# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
684
685IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
686{
687 if (!(*pfEFlags & X86_EFL_CF))
688 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
689 else
690 {
691 uint32_t uDst = *puDst;
692 uint32_t uResult = uDst + uSrc + 1;
693 *puDst = uResult;
694 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
695 }
696}
697
698
699IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
700{
701 if (!(*pfEFlags & X86_EFL_CF))
702 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
703 else
704 {
705 uint16_t uDst = *puDst;
706 uint16_t uResult = uDst + uSrc + 1;
707 *puDst = uResult;
708 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
709 }
710}
711
712
713IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
714{
715 if (!(*pfEFlags & X86_EFL_CF))
716 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
717 else
718 {
719 uint8_t uDst = *puDst;
720 uint8_t uResult = uDst + uSrc + 1;
721 *puDst = uResult;
722 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
723 }
724}
725
726# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
727
728/*
729 * SUB
730 */
731
732IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
733{
734 uint64_t uDst = *puDst;
735 uint64_t uResult = uDst - uSrc;
736 *puDst = uResult;
737 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
738}
739
740# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
741
742IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
743{
744 uint32_t uDst = *puDst;
745 uint32_t uResult = uDst - uSrc;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
748}
749
750
751IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
752{
753 uint16_t uDst = *puDst;
754 uint16_t uResult = uDst - uSrc;
755 *puDst = uResult;
756 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
757}
758
759
760IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
761{
762 uint8_t uDst = *puDst;
763 uint8_t uResult = uDst - uSrc;
764 *puDst = uResult;
765 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
766}
767
768# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
769
770/*
771 * SBB
772 */
773
774IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
775{
776 if (!(*pfEFlags & X86_EFL_CF))
777 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
778 else
779 {
780 uint64_t uDst = *puDst;
781 uint64_t uResult = uDst - uSrc - 1;
782 *puDst = uResult;
783 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
784 }
785}
786
787# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
788
789IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
790{
791 if (!(*pfEFlags & X86_EFL_CF))
792 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
793 else
794 {
795 uint32_t uDst = *puDst;
796 uint32_t uResult = uDst - uSrc - 1;
797 *puDst = uResult;
798 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
799 }
800}
801
802
803IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
804{
805 if (!(*pfEFlags & X86_EFL_CF))
806 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
807 else
808 {
809 uint16_t uDst = *puDst;
810 uint16_t uResult = uDst - uSrc - 1;
811 *puDst = uResult;
812 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
813 }
814}
815
816
817IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
818{
819 if (!(*pfEFlags & X86_EFL_CF))
820 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
821 else
822 {
823 uint8_t uDst = *puDst;
824 uint8_t uResult = uDst - uSrc - 1;
825 *puDst = uResult;
826 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
827 }
828}
829
830# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
831
832
833/*
834 * OR
835 */
836
837IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
838{
839 uint64_t uResult = *puDst | uSrc;
840 *puDst = uResult;
841 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
842}
843
844# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
845
846IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
847{
848 uint32_t uResult = *puDst | uSrc;
849 *puDst = uResult;
850 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
851}
852
853
854IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
855{
856 uint16_t uResult = *puDst | uSrc;
857 *puDst = uResult;
858 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
859}
860
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
863{
864 uint8_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
867}
868
869# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
870
871/*
872 * XOR
873 */
874
875IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
876{
877 uint64_t uResult = *puDst ^ uSrc;
878 *puDst = uResult;
879 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
880}
881
882# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
883
884IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
885{
886 uint32_t uResult = *puDst ^ uSrc;
887 *puDst = uResult;
888 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
889}
890
891
892IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
893{
894 uint16_t uResult = *puDst ^ uSrc;
895 *puDst = uResult;
896 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
897}
898
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
901{
902 uint8_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
905}
906
907# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
908
909/*
910 * AND
911 */
912
913IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
914{
915 uint64_t const uResult = *puDst & uSrc;
916 *puDst = uResult;
917 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
918}
919
920# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
921
922IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
923{
924 uint32_t const uResult = *puDst & uSrc;
925 *puDst = uResult;
926 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
927}
928
929
930IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
931{
932 uint16_t const uResult = *puDst & uSrc;
933 *puDst = uResult;
934 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
935}
936
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
939{
940 uint8_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
943}
944
945# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
946#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
947
948/*
949 * ANDN (BMI1 instruction)
950 */
951
952IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
953{
954 uint64_t const uResult = ~uSrc1 & uSrc2;
955 *puDst = uResult;
956 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
957}
958
959
960IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
961{
962 uint32_t const uResult = ~uSrc1 & uSrc2;
963 *puDst = uResult;
964 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
965}
966
967
968#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
969IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
970{
971 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
972}
973#endif
974
975
976#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
978{
979 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
980}
981#endif
982
983#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
984
985/*
986 * CMP
987 */
988
989IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
990{
991 uint64_t uDstTmp = *puDst;
992 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
993}
994
995# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
996
997IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
998{
999 uint32_t uDstTmp = *puDst;
1000 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1001}
1002
1003
1004IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1005{
1006 uint16_t uDstTmp = *puDst;
1007 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1008}
1009
1010
1011IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1012{
1013 uint8_t uDstTmp = *puDst;
1014 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1015}
1016
1017# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1018
1019/*
1020 * TEST
1021 */
1022
1023IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1024{
1025 uint64_t uResult = *puDst & uSrc;
1026 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1027}
1028
1029# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1030
1031IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1032{
1033 uint32_t uResult = *puDst & uSrc;
1034 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1035}
1036
1037
1038IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1039{
1040 uint16_t uResult = *puDst & uSrc;
1041 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1042}
1043
1044
1045IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1046{
1047 uint8_t uResult = *puDst & uSrc;
1048 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1049}
1050
1051# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1052
1053
1054/*
1055 * LOCK prefixed variants of the above
1056 */
1057
/**
 * Locked binary operand operation, for any operand width.
 *
 * Expands inside a function that has puDst, uSrc and pfEFlags in scope.  The
 * non-locked iemAImpl_<mnemonic>_u<width> worker is run on private copies of
 * the destination value and the flags, and the outcome is published with a
 * compare-exchange; the whole sequence is repeated until that exchange
 * succeeds.  *pfEFlags is only written after a successful exchange.
 *
 * @note The last compare-exchange argument presumably receives the value that
 *       was actually found in *puDst, so each retry starts from it (verify
 *       against the ASMAtomicCmpXchgEx documentation).
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uSeen = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uNew; \
        uint32_t                   fEflNew; \
        for (;;) \
        { \
            uNew    = uSeen; \
            fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uSeen, &uSeen)) \
                break; \
        } \
        *pfEFlags = fEflNew; \
    } while (0)
1072
1073
/**
 * Emits iemAImpl_<mnemonic>_u<width>_locked, the LOCK prefixed counterpart of
 * iemAImpl_<mnemonic>_u<width>.  The function body is DO_LOCKED_BIN_OP.
 */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked, \
                      (uint ## a_cBitsWidth ## _t *puDst, uint ## a_cBitsWidth ## _t uSrc, uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1081
1082EMIT_LOCKED_BIN_OP(add, 64)
1083EMIT_LOCKED_BIN_OP(adc, 64)
1084EMIT_LOCKED_BIN_OP(sub, 64)
1085EMIT_LOCKED_BIN_OP(sbb, 64)
1086EMIT_LOCKED_BIN_OP(or, 64)
1087EMIT_LOCKED_BIN_OP(xor, 64)
1088EMIT_LOCKED_BIN_OP(and, 64)
1089# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1090EMIT_LOCKED_BIN_OP(add, 32)
1091EMIT_LOCKED_BIN_OP(adc, 32)
1092EMIT_LOCKED_BIN_OP(sub, 32)
1093EMIT_LOCKED_BIN_OP(sbb, 32)
1094EMIT_LOCKED_BIN_OP(or, 32)
1095EMIT_LOCKED_BIN_OP(xor, 32)
1096EMIT_LOCKED_BIN_OP(and, 32)
1097
1098EMIT_LOCKED_BIN_OP(add, 16)
1099EMIT_LOCKED_BIN_OP(adc, 16)
1100EMIT_LOCKED_BIN_OP(sub, 16)
1101EMIT_LOCKED_BIN_OP(sbb, 16)
1102EMIT_LOCKED_BIN_OP(or, 16)
1103EMIT_LOCKED_BIN_OP(xor, 16)
1104EMIT_LOCKED_BIN_OP(and, 16)
1105
1106EMIT_LOCKED_BIN_OP(add, 8)
1107EMIT_LOCKED_BIN_OP(adc, 8)
1108EMIT_LOCKED_BIN_OP(sub, 8)
1109EMIT_LOCKED_BIN_OP(sbb, 8)
1110EMIT_LOCKED_BIN_OP(or, 8)
1111EMIT_LOCKED_BIN_OP(xor, 8)
1112EMIT_LOCKED_BIN_OP(and, 8)
1113# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1114
1115
1116/*
1117 * Bit operations (same signature as above).
1118 */
1119
1120/*
1121 * BT
1122 */
1123
1124IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1125{
1126 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1127 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1128 Assert(uSrc < 64);
1129 uint64_t uDst = *puDst;
1130 if (uDst & RT_BIT_64(uSrc))
1131 *pfEFlags |= X86_EFL_CF;
1132 else
1133 *pfEFlags &= ~X86_EFL_CF;
1134}
1135
1136# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1137
1138IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1139{
1140 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1141 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1142 Assert(uSrc < 32);
1143 uint32_t uDst = *puDst;
1144 if (uDst & RT_BIT_32(uSrc))
1145 *pfEFlags |= X86_EFL_CF;
1146 else
1147 *pfEFlags &= ~X86_EFL_CF;
1148}
1149
1150IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1151{
1152 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1153 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1154 Assert(uSrc < 16);
1155 uint16_t uDst = *puDst;
1156 if (uDst & RT_BIT_32(uSrc))
1157 *pfEFlags |= X86_EFL_CF;
1158 else
1159 *pfEFlags &= ~X86_EFL_CF;
1160}
1161
1162# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1163
1164/*
1165 * BTC
1166 */
1167
1168IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1169{
1170 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1171 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1172 Assert(uSrc < 64);
1173 uint64_t fMask = RT_BIT_64(uSrc);
1174 uint64_t uDst = *puDst;
1175 if (uDst & fMask)
1176 {
1177 uDst &= ~fMask;
1178 *puDst = uDst;
1179 *pfEFlags |= X86_EFL_CF;
1180 }
1181 else
1182 {
1183 uDst |= fMask;
1184 *puDst = uDst;
1185 *pfEFlags &= ~X86_EFL_CF;
1186 }
1187}
1188
1189# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1190
1191IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1192{
1193 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1194 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1195 Assert(uSrc < 32);
1196 uint32_t fMask = RT_BIT_32(uSrc);
1197 uint32_t uDst = *puDst;
1198 if (uDst & fMask)
1199 {
1200 uDst &= ~fMask;
1201 *puDst = uDst;
1202 *pfEFlags |= X86_EFL_CF;
1203 }
1204 else
1205 {
1206 uDst |= fMask;
1207 *puDst = uDst;
1208 *pfEFlags &= ~X86_EFL_CF;
1209 }
1210}
1211
1212
1213IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1214{
1215 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1216 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1217 Assert(uSrc < 16);
1218 uint16_t fMask = RT_BIT_32(uSrc);
1219 uint16_t uDst = *puDst;
1220 if (uDst & fMask)
1221 {
1222 uDst &= ~fMask;
1223 *puDst = uDst;
1224 *pfEFlags |= X86_EFL_CF;
1225 }
1226 else
1227 {
1228 uDst |= fMask;
1229 *puDst = uDst;
1230 *pfEFlags &= ~X86_EFL_CF;
1231 }
1232}
1233
1234# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1235
1236/*
1237 * BTR
1238 */
1239
1240IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1241{
1242 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1243 logical operation (AND/OR/whatever). */
1244 Assert(uSrc < 64);
1245 uint64_t fMask = RT_BIT_64(uSrc);
1246 uint64_t uDst = *puDst;
1247 if (uDst & fMask)
1248 {
1249 uDst &= ~fMask;
1250 *puDst = uDst;
1251 *pfEFlags |= X86_EFL_CF;
1252 }
1253 else
1254 *pfEFlags &= ~X86_EFL_CF;
1255}
1256
1257# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1258
1259IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1260{
1261 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1262 logical operation (AND/OR/whatever). */
1263 Assert(uSrc < 32);
1264 uint32_t fMask = RT_BIT_32(uSrc);
1265 uint32_t uDst = *puDst;
1266 if (uDst & fMask)
1267 {
1268 uDst &= ~fMask;
1269 *puDst = uDst;
1270 *pfEFlags |= X86_EFL_CF;
1271 }
1272 else
1273 *pfEFlags &= ~X86_EFL_CF;
1274}
1275
1276
1277IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1278{
1279 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1280 logical operation (AND/OR/whatever). */
1281 Assert(uSrc < 16);
1282 uint16_t fMask = RT_BIT_32(uSrc);
1283 uint16_t uDst = *puDst;
1284 if (uDst & fMask)
1285 {
1286 uDst &= ~fMask;
1287 *puDst = uDst;
1288 *pfEFlags |= X86_EFL_CF;
1289 }
1290 else
1291 *pfEFlags &= ~X86_EFL_CF;
1292}
1293
1294# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1295
1296/*
1297 * BTS
1298 */
1299
1300IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1301{
1302 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1303 logical operation (AND/OR/whatever). */
1304 Assert(uSrc < 64);
1305 uint64_t fMask = RT_BIT_64(uSrc);
1306 uint64_t uDst = *puDst;
1307 if (uDst & fMask)
1308 *pfEFlags |= X86_EFL_CF;
1309 else
1310 {
1311 uDst |= fMask;
1312 *puDst = uDst;
1313 *pfEFlags &= ~X86_EFL_CF;
1314 }
1315}
1316
1317# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1318
1319IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1320{
1321 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1322 logical operation (AND/OR/whatever). */
1323 Assert(uSrc < 32);
1324 uint32_t fMask = RT_BIT_32(uSrc);
1325 uint32_t uDst = *puDst;
1326 if (uDst & fMask)
1327 *pfEFlags |= X86_EFL_CF;
1328 else
1329 {
1330 uDst |= fMask;
1331 *puDst = uDst;
1332 *pfEFlags &= ~X86_EFL_CF;
1333 }
1334}
1335
1336
1337IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1338{
1339 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1340 logical operation (AND/OR/whatever). */
1341 Assert(uSrc < 16);
1342 uint16_t fMask = RT_BIT_32(uSrc);
1343 uint32_t uDst = *puDst;
1344 if (uDst & fMask)
1345 *pfEFlags |= X86_EFL_CF;
1346 else
1347 {
1348 uDst |= fMask;
1349 *puDst = uDst;
1350 *pfEFlags &= ~X86_EFL_CF;
1351 }
1352}
1353
1354# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1355
1356
1357EMIT_LOCKED_BIN_OP(btc, 64)
1358EMIT_LOCKED_BIN_OP(btr, 64)
1359EMIT_LOCKED_BIN_OP(bts, 64)
1360# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1361EMIT_LOCKED_BIN_OP(btc, 32)
1362EMIT_LOCKED_BIN_OP(btr, 32)
1363EMIT_LOCKED_BIN_OP(bts, 32)
1364
1365EMIT_LOCKED_BIN_OP(btc, 16)
1366EMIT_LOCKED_BIN_OP(btr, 16)
1367EMIT_LOCKED_BIN_OP(bts, 16)
1368# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1369
1370
/*
 * Helpers for BSR and BSF.
 *
 * Note! "undefined" flags: OF, SF, AF, PF, CF.
 *       Intel behavior modelled on 10980xe, AMD on 3990X.  Other
 *       microarchitectures may produce different results (see
 *       https://www.sandpile.org/x86/flags.htm), but we restrict ourselves
 *       to emulating these recent ones.
 */
/**
 * Stores a BSF/BSR outcome, Intel flavour.
 *
 * @param   a_puDst     Where to store the zero-based bit index; left untouched
 *                      when no bit was found.
 * @param   a_pfEFlags  The EFLAGS variable to update.  OF, SF, ZF, AF, PF and
 *                      CF are all cleared first; then either the parity table
 *                      entry for the stored index is ORed in (bit found), or
 *                      ZF and PF are set (no bit found).
 * @param   a_iBit      One-based index of the bit found, 0 if none.  Evaluated
 *                      exactly once.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR outcome, AMD flavour.
 *
 * @param   a_puDst     Where to store the zero-based bit index; left untouched
 *                      when no bit was found.
 * @param   a_pfEFlags  The EFLAGS variable to update.  Only ZF changes: it is
 *                      cleared when a bit was found and set otherwise.
 * @param   a_iBit      One-based index of the bit found, 0 if none.  Evaluated
 *                      exactly once.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1401
1402
1403/*
1404 * BSF - first (least significant) bit set
1405 */
1406IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1407{
1408 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1409}
1410
1411IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1412{
1413 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1414}
1415
1416IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1417{
1418 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1419}
1420
1421# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1422
1423IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1424{
1425 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1426}
1427
1428IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1429{
1430 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1431}
1432
1433IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1434{
1435 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1436}
1437
1438
1439IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1440{
1441 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1442}
1443
1444IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1445{
1446 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1447}
1448
1449IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1450{
1451 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1452}
1453
1454# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1455
1456
1457/*
1458 * BSR - last (most significant) bit set
1459 */
1460IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1461{
1462 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1463}
1464
1465IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1466{
1467 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1468}
1469
1470IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1471{
1472 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1473}
1474
1475# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1476
1477IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1478{
1479 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1480}
1481
1482IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1483{
1484 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1485}
1486
1487IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1488{
1489 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1490}
1491
1492
1493IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1494{
1495 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1496}
1497
1498IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1499{
1500 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1501}
1502
1503IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1504{
1505 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1506}
1507
1508# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1509
1510
1511/*
1512 * Helpers for LZCNT and TZCNT.
1513 */
/**
 * Stores an LZCNT/TZCNT outcome, Intel flavour.
 *
 * @param   a_puDst     Where to store the count (always written).
 * @param   a_uSrc      The source operand; CF is set when it is zero.
 *                      Evaluated exactly once.
 * @param   a_pfEFlags  The EFLAGS variable to update.  OF, SF, ZF, AF, PF and
 *                      CF are cleared first.  A non-zero count ORs in its
 *                      parity table entry; a zero count sets ZF and PF.
 * @param   a_uResult   The count.  Evaluated exactly once.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT outcome, AMD flavour.
 *
 * @param   a_puDst     Where to store the count (always written).
 * @param   a_uSrc      The source operand; CF is set when it is zero.
 *                      Evaluated exactly once.
 * @param   a_pfEFlags  The EFLAGS variable to update.  Only ZF and CF are
 *                      recalculated; ZF is set when the count is zero.
 * @param   a_uResult   The count.  Evaluated exactly once.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1536
1537
1538/*
1539 * LZCNT - count leading zero bits.
1540 */
1541IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1542{
1543 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1544}
1545
1546IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1547{
1548 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1549}
1550
1551IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1552{
1553 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1554}
1555
1556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1557
1558IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1559{
1560 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1561}
1562
1563IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1564{
1565 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1566}
1567
1568IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1569{
1570 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1571}
1572
1573
1574IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1575{
1576 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1577}
1578
1579IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1580{
1581 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1582}
1583
1584IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1585{
1586 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1587}
1588
1589# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1590
1591
/*
 * TZCNT - count trailing zero bits.
 */
1595IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1596{
1597 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1598}
1599
1600IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1601{
1602 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1603}
1604
1605IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1606{
1607 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1608}
1609
1610# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1611
1612IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1613{
1614 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1615}
1616
1617IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1618{
1619 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1620}
1621
1622IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1623{
1624 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1625}
1626
1627
1628IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1629{
1630 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1631}
1632
1633IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1634{
1635 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1636}
1637
1638IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1639{
1640 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1641}
1642
1643# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1644#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1645
1646/*
1647 * BEXTR (BMI1 instruction)
1648 */
1649#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1650IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1651 a_Type uSrc2, uint32_t *pfEFlags)) \
1652{ \
1653 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1654 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1655 a_Type uResult; \
1656 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1657 if (iFirstBit < a_cBits) \
1658 { \
1659 uResult = uSrc1 >> iFirstBit; \
1660 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1661 if (cBits < a_cBits) \
1662 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1663 *puDst = uResult; \
1664 if (!uResult) \
1665 fEfl |= X86_EFL_ZF; \
1666 } \
1667 else \
1668 { \
1669 *puDst = uResult = 0; \
1670 fEfl |= X86_EFL_ZF; \
1671 } \
1672 /** @todo complete flag calculations. */ \
1673 *pfEFlags = fEfl; \
1674}
1675
1676EMIT_BEXTR(64, uint64_t, _fallback)
1677EMIT_BEXTR(32, uint32_t, _fallback)
1678#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1679EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1680#endif
1681#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1682EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1683#endif
1684
1685/*
1686 * BLSR (BMI1 instruction)
1687 */
1688#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1689IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1690{ \
1691 uint32_t fEfl1 = *pfEFlags; \
1692 uint32_t fEfl2 = fEfl1; \
1693 *puDst = uSrc; \
1694 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1695 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1696 \
1697 /* AMD: The carry flag is from the SUB operation. */ \
1698 /* 10890xe: PF always cleared? */ \
1699 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1700 fEfl2 |= fEfl1 & X86_EFL_CF; \
1701 *pfEFlags = fEfl2; \
1702}
1703
1704EMIT_BLSR(64, uint64_t, _fallback)
1705EMIT_BLSR(32, uint32_t, _fallback)
1706#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BLSR(64, uint64_t, RT_NOTHING)
1708#endif
1709#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1710EMIT_BLSR(32, uint32_t, RT_NOTHING)
1711#endif
1712
1713/*
1714 * BLSMSK (BMI1 instruction)
1715 */
1716#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1717IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1718{ \
1719 uint32_t fEfl1 = *pfEFlags; \
1720 uint32_t fEfl2 = fEfl1; \
1721 *puDst = uSrc; \
1722 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1723 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1724 \
1725 /* AMD: The carry flag is from the SUB operation. */ \
1726 /* 10890xe: PF always cleared? */ \
1727 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1728 fEfl2 |= fEfl1 & X86_EFL_CF; \
1729 *pfEFlags = fEfl2; \
1730}
1731
1732EMIT_BLSMSK(64, uint64_t, _fallback)
1733EMIT_BLSMSK(32, uint32_t, _fallback)
1734#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1736#endif
1737#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1738EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1739#endif
1740
1741/*
1742 * BLSI (BMI1 instruction)
1743 */
1744#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1745IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1746{ \
1747 uint32_t fEfl1 = *pfEFlags; \
1748 uint32_t fEfl2 = fEfl1; \
1749 *puDst = uSrc; \
1750 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1751 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1752 \
1753 /* AMD: The carry flag is from the SUB operation. */ \
1754 /* 10890xe: PF always cleared? */ \
1755 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1756 fEfl2 |= fEfl1 & X86_EFL_CF; \
1757 *pfEFlags = fEfl2; \
1758}
1759
1760EMIT_BLSI(64, uint64_t, _fallback)
1761EMIT_BLSI(32, uint32_t, _fallback)
1762#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSI(64, uint64_t, RT_NOTHING)
1764#endif
1765#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1766EMIT_BLSI(32, uint32_t, RT_NOTHING)
1767#endif
1768
1769/*
1770 * BZHI (BMI2 instruction)
1771 */
1772#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1773IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1774 a_Type uSrc2, uint32_t *pfEFlags)) \
1775{ \
1776 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1777 a_Type uResult; \
1778 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1779 if (iFirstBit < a_cBits) \
1780 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1781 else \
1782 { \
1783 uResult = uSrc1; \
1784 fEfl |= X86_EFL_CF; \
1785 } \
1786 *puDst = uResult; \
1787 fEfl |= X86_EFL_CALC_ZF(uResult); \
1788 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1789 *pfEFlags = fEfl; \
1790}
1791
1792EMIT_BZHI(64, uint64_t, _fallback)
1793EMIT_BZHI(32, uint32_t, _fallback)
1794#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1795EMIT_BZHI(64, uint64_t, RT_NOTHING)
1796#endif
1797#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1798EMIT_BZHI(32, uint32_t, RT_NOTHING)
1799#endif
1800
1801/*
1802 * POPCNT
1803 */
1804RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1805{
1806 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1807 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1808 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1809 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1810};
1811
1812/** @todo Use native popcount where possible and employ some more efficient
1813 * algorithm here (or in asm.h fallback)! */
1814
1815DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1816{
1817 return g_abBitCounts6[ u16 & 0x3f]
1818 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1819 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1820}
1821
1822DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1823{
1824 return g_abBitCounts6[ u32 & 0x3f]
1825 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1826 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1827 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1828 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1829 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1830}
1831
1832DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1833{
1834 return g_abBitCounts6[ u64 & 0x3f]
1835 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1836 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1837 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1838 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1839 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1840 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1841 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1842 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1843 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1844 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1845}
1846
1847#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1848IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1849{ \
1850 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1851 a_Type uResult; \
1852 if (uSrc) \
1853 uResult = iemPopCountU ## a_cBits(uSrc); \
1854 else \
1855 { \
1856 fEfl |= X86_EFL_ZF; \
1857 uResult = 0; \
1858 } \
1859 *puDst = uResult; \
1860 *pfEFlags = fEfl; \
1861}
1862
1863EMIT_POPCNT(64, uint64_t, _fallback)
1864EMIT_POPCNT(32, uint32_t, _fallback)
1865EMIT_POPCNT(16, uint16_t, _fallback)
1866#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1867EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1868#endif
1869#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1870EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1871EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1872#endif
1873
1874
1875#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1876
1877/*
1878 * XCHG
1879 */
1880
1881IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1882{
1883#if ARCH_BITS >= 64
1884 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1885#else
1886 uint64_t uOldMem = *puMem;
1887 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1888 ASMNopPause();
1889 *puReg = uOldMem;
1890#endif
1891}
1892
1893# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1894
1895IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1896{
1897 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1898}
1899
1900
1901IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1902{
1903 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1904}
1905
1906
1907IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1908{
1909 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1910}
1911
1912# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1913
1914
1915/* Unlocked variants for fDisregardLock mode: */
1916
1917IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1918{
1919 uint64_t const uOld = *puMem;
1920 *puMem = *puReg;
1921 *puReg = uOld;
1922}
1923
1924# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1927{
1928 uint32_t const uOld = *puMem;
1929 *puMem = *puReg;
1930 *puReg = uOld;
1931}
1932
1933
1934IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1935{
1936 uint16_t const uOld = *puMem;
1937 *puMem = *puReg;
1938 *puReg = uOld;
1939}
1940
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1943{
1944 uint8_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1950
1951
1952/*
1953 * XADD and LOCK XADD.
1954 */
1955#define EMIT_XADD(a_cBitsWidth, a_Type) \
1956IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1957{ \
1958 a_Type uDst = *puDst; \
1959 a_Type uResult = uDst; \
1960 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1961 *puDst = uResult; \
1962 *puReg = uDst; \
1963} \
1964\
1965IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1966{ \
1967 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1968 a_Type uResult; \
1969 uint32_t fEflTmp; \
1970 do \
1971 { \
1972 uResult = uOld; \
1973 fEflTmp = *pfEFlags; \
1974 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
1975 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
1976 *puReg = uOld; \
1977 *pfEFlags = fEflTmp; \
1978}
1979EMIT_XADD(64, uint64_t)
1980# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1981EMIT_XADD(32, uint32_t)
1982EMIT_XADD(16, uint16_t)
1983EMIT_XADD(8, uint8_t)
1984# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1985
1986#endif
1987
1988/*
1989 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
1990 *
1991 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
1992 * instructions are emulated as locked.
1993 */
1994#if defined(IEM_WITHOUT_ASSEMBLY)
1995
1996IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
1997{
1998 uint8_t uOld = *puAl;
1999 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2000 Assert(*puAl == uOld);
2001 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2002}
2003
2004
2005IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2006{
2007 uint16_t uOld = *puAx;
2008 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2009 Assert(*puAx == uOld);
2010 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2011}
2012
2013
2014IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2015{
2016 uint32_t uOld = *puEax;
2017 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2018 Assert(*puEax == uOld);
2019 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2020}
2021
2022
2023# if ARCH_BITS == 32
2024IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2025# else
2026IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2027# endif
2028{
2029# if ARCH_BITS == 32
2030 uint64_t const uSrcReg = *puSrcReg;
2031# endif
2032 uint64_t uOld = *puRax;
2033 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2034 Assert(*puRax == uOld);
2035 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2040 uint32_t *pEFlags))
2041{
2042 uint64_t const uNew = pu64EbxEcx->u;
2043 uint64_t const uOld = pu64EaxEdx->u;
2044 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2045 {
2046 Assert(pu64EaxEdx->u == uOld);
2047 *pEFlags |= X86_EFL_ZF;
2048 }
2049 else
2050 *pEFlags &= ~X86_EFL_ZF;
2051}
2052
2053
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * CMPXCHG16B, done with the 128-bit compare-and-exchange primitive available
 * for the host architecture.
 *
 * @param   pu128Dst        The destination operand.
 * @param   pu128RaxRdx     The comparand; also handed to the primitive as its
 *                          old-value output.
 * @param   pu128RbxRcx     The replacement value.
 * @param   pEFlags         EFLAGS; ZF is set when the exchange was reported as
 *                          done and cleared otherwise.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    RTUINT128U const uOld = *pu128RaxRdx; /* only needed by the assertion below */
#  endif
    bool fSwapped;
#  if defined(RT_ARCH_AMD64)
    fSwapped = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
                                      &pu128RaxRdx->u);
#  else
    fSwapped = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (fSwapped)
    {
        Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
    else
        *pEFlags &= ~X86_EFL_ZF;
}
# endif
2075
2076#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2077
2078# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2079IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2080 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2081{
2082 RTUINT128U u128Tmp = *pu128Dst;
2083 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2084 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2085 {
2086 *pu128Dst = *pu128RbxRcx;
2087 *pEFlags |= X86_EFL_ZF;
2088 }
2089 else
2090 {
2091 *pu128RaxRdx = u128Tmp;
2092 *pEFlags &= ~X86_EFL_ZF;
2093 }
2094}
2095#endif /* !RT_ARCH_ARM64 */
2096
2097#if defined(IEM_WITHOUT_ASSEMBLY)
2098
2099/* Unlocked versions mapped to the locked ones: */
2100
2101IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2102{
2103 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2104}
2105
2106
2107IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2108{
2109 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2110}
2111
2112
2113IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2114{
2115 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2116}
2117
2118
2119# if ARCH_BITS == 32
2120IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2121{
2122 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2123}
2124# else
2125IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2126{
2127 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2128}
2129# endif
2130
2131
2132IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2133{
2134 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2135}
2136
2137
2138IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2139 uint32_t *pEFlags))
2140{
2141 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2142}
2143
2144#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2145
2146#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2147 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2148
/*
 * MUL, IMUL, DIV and IDIV helpers.
 *
 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
 *   multiplication and division steps so we can select between using C
 *   operators and RTUInt128DivRem/RTUInt128MulU64ByU64.
 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV taking all the input in AX too.  This means we have to abstract
 *   some input loads and the result storing.
 */
2160
2161DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2162{
2163# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2164 pQuotient->s.Lo = 0;
2165 pQuotient->s.Hi = 0;
2166# endif
2167 RTUINT128U Divisor;
2168 Divisor.s.Lo = u64Divisor;
2169 Divisor.s.Hi = 0;
2170 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2171}
2172
/*
 * Building blocks plugged into the EMIT_MUL, EMIT_MULX, EMIT_IMUL, EMIT_DIV
 * and EMIT_IDIV templates.
 *
 * The load/store macros deliberately reference puA, puD and puAX, which must
 * be parameters of the function the template expands into.
 *
 * Every macro expands to a single expression without a trailing semicolon and
 * with its arguments parenthesized, so it can be used wherever a statement
 * expression is valid (including unbraced if/else bodies).
 */

/* Loads the dividend: xAX into the low half and xDX into the high half. */
# define DIV_LOAD(a_Dividend) \
    ((a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD)
/* 8-bit variant: the whole dividend comes from AX. */
# define DIV_LOAD_U8(a_Dividend) \
    ((a_Dividend).u = *puAX)

/* Stores quotient and remainder; the 8-bit variant packs them into AL (low byte) and AH (high byte). */
# define DIV_STORE(a_Quotient, a_uReminder)     (*puA = (a_Quotient), *puD = (a_uReminder))
# define DIV_STORE_U8(a_Quotient, a_uReminder)  (*puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uReminder) << 8))

/* Loads the implicit first factor; the 8-bit variant uses the low byte of AX. */
# define MUL_LOAD_F1()                          (*puA)
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/* Stores the double width product; the 8-bit variant writes all of AX. */
# define MUL_STORE(a_Result)                    (*puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi)
# define MUL_STORE_U8(a_Result)                 (*puAX = (a_Result).u)

/* Negates a double width value in place. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    ((a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u)
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned multiplication yielding a double width product. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    ((a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2))
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), (a_Factor1), (a_Factor2))

/* Unsigned division of a double width dividend, yielding quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    ((a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
     (a_Remainder).u = (a_Dividend).u % (a_uDivisor))
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), (a_uDivisor))
2202
2203
2204/*
2205 * MUL
2206 */
2207# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2208IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2209{ \
2210 RTUINT ## a_cBitsWidth2x ## U Result; \
2211 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2212 a_fnStore(Result); \
2213 \
2214 /* Calc EFLAGS: */ \
2215 uint32_t fEfl = *pfEFlags; \
2216 if (a_fIntelFlags) \
2217 { /* Intel: 6700K and 10980XE behavior */ \
2218 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2219 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2220 fEfl |= X86_EFL_SF; \
2221 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2222 if (Result.s.Hi != 0) \
2223 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2224 } \
2225 else \
2226 { /* AMD: 3990X */ \
2227 if (Result.s.Hi != 0) \
2228 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2229 else \
2230 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2231 } \
2232 *pfEFlags = fEfl; \
2233 return 0; \
2234} \
2235
2236# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2237 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2238 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2239 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2240
2241# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2242EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2243 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2244# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2245EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2246 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2247EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2248 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2249EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2250 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2251# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2252# endif /* !DOXYGEN_RUNNING */
2253
2254/*
2255 * MULX
2256 */
2257# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2258IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2259 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2260{ \
2261 RTUINT ## a_cBitsWidth2x ## U Result; \
2262 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2263 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2264 *puDst1 = Result.s.Hi; \
2265} \
2266
2267# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2268EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2269EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2270# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2271EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2272EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2273# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2274# endif /* !DOXYGEN_RUNNING */
2275
2276
2277/*
2278 * IMUL
2279 *
2280 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2281 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2282 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2283 */
2284# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2285 a_Suffix, a_fIntelFlags) \
2286IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2287{ \
2288 RTUINT ## a_cBitsWidth2x ## U Result; \
2289 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2290 \
2291 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2292 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2293 { \
2294 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2295 { \
2296 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2297 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2298 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2299 } \
2300 else \
2301 { \
2302 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2303 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2304 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2305 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2306 a_fnNeg(Result, a_cBitsWidth2x); \
2307 } \
2308 } \
2309 else \
2310 { \
2311 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2312 { \
2313 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2314 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2315 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2316 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2317 a_fnNeg(Result, a_cBitsWidth2x); \
2318 } \
2319 else \
2320 { \
2321 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2322 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2323 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2324 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2325 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2326 } \
2327 } \
2328 a_fnStore(Result); \
2329 \
2330 if (a_fIntelFlags) \
2331 { \
2332 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2333 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2334 fEfl |= X86_EFL_SF; \
2335 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2336 } \
2337 *pfEFlags = fEfl; \
2338 return 0; \
2339}
2340# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2341 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2342 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2343 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2344
2345# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2346EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2347 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2348# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2349EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2350 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2351EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2352 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2353EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2354 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2355# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2356# endif /* !DOXYGEN_RUNNING */
2357
2358
2359/*
2360 * IMUL with two operands are mapped onto the three operand variant, ignoring
2361 * the high part of the product.
2362 */
2363# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2364IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2365{ \
2366 a_uType uIgn; \
2367 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2368} \
2369\
2370IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2371{ \
2372 a_uType uIgn; \
2373 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2374} \
2375\
2376IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2377{ \
2378 a_uType uIgn; \
2379 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2380}
2381
2382EMIT_IMUL_TWO(64, uint64_t)
2383# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2384EMIT_IMUL_TWO(32, uint32_t)
2385EMIT_IMUL_TWO(16, uint16_t)
2386# endif
2387
2388
2389/*
2390 * DIV
2391 */
2392# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2393 a_Suffix, a_fIntelFlags) \
2394IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2395{ \
2396 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2397 a_fnLoad(Dividend); \
2398 if ( uDivisor != 0 \
2399 && Dividend.s.Hi < uDivisor) \
2400 { \
2401 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2402 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2403 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2404 \
2405 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2406 if (!a_fIntelFlags) \
2407 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2408 return 0; \
2409 } \
2410 /* #DE */ \
2411 return -1; \
2412}
2413# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2414 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2415 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2416 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2417
2418# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2419EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2420 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2421# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2422EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2423 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2424EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2425 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2426EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2427 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2428# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2429# endif /* !DOXYGEN_RUNNING */
2430
2431
2432/*
2433 * IDIV
2434 *
2435 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2436 * set AF and clear PF, ZF and SF just like it does for DIV.
2437 *
2438 */
2439# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2440 a_Suffix, a_fIntelFlags) \
2441IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2442{ \
2443 /* Note! Skylake leaves all flags alone. */ \
2444 \
2445 /** @todo overflow checks */ \
2446 if (uDivisor != 0) \
2447 { \
2448 /* \
2449 * Convert to unsigned division. \
2450 */ \
2451 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2452 a_fnLoad(Dividend); \
2453 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2454 if (fSignedDividend) \
2455 a_fnNeg(Dividend, a_cBitsWidth2x); \
2456 \
2457 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2458 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2459 uDivisorPositive = uDivisor; \
2460 else \
2461 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2462 \
2463 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2464 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2465 \
2466 /* \
2467 * Setup the result, checking for overflows. \
2468 */ \
2469 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2470 { \
2471 if (!fSignedDividend) \
2472 { \
2473 /* Positive divisor, positive dividend => result positive. */ \
2474 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2475 { \
2476 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2477 if (!a_fIntelFlags) \
2478 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2479 return 0; \
2480 } \
2481 } \
2482 else \
2483 { \
2484 /* Positive divisor, negative dividend => result negative. */ \
2485 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2486 { \
2487 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2488 if (!a_fIntelFlags) \
2489 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2490 return 0; \
2491 } \
2492 } \
2493 } \
2494 else \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2500 { \
2501 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2511 { \
2512 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 } \
2520 /* #DE */ \
2521 return -1; \
2522}
2523# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2524 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2525 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2526 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2527
2528# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2529EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2530 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2531# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2532EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2533 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2534EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2535 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2536EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2537 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2538# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2539# endif /* !DOXYGEN_RUNNING */
2540
2541#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2542
2543
2544/*********************************************************************************************************************************
2545* Unary operations. *
2546*********************************************************************************************************************************/
2547#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2548
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * This is a statement macro: it yields no value, the new flags are written
 * back through @a a_pfEFlags.  Note that the arguments are evaluated more
 * than once.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* Start out with all the status bits except CF cleared. */ \
        uint32_t fEflIncDec = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS | X86_EFL_CF); \
        fEflIncDec |= g_afParity[(a_uResult) & 0xff]; \
        fEflIncDec |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflIncDec |= X86_EFL_CALC_ZF(a_uResult); \
        fEflIncDec |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        if (a_OfMethod == 0) /* INC-style: top bit clear in the old value and set in the result. */ \
            fEflIncDec |= X86_EFL_GET_OF_ ## a_cBitsWidth(((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)); \
        else                 /* DEC-style: top bit set in the old value and clear in the result. */ \
            fEflIncDec |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))); \
        *(a_pfEFlags) = fEflIncDec; \
    } while (0)
2574
2575/*
2576 * INC
2577 */
2578
2579IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2580{
2581 uint64_t uDst = *puDst;
2582 uint64_t uResult = uDst + 1;
2583 *puDst = uResult;
2584 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2585}
2586
2587# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2588
2589IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2590{
2591 uint32_t uDst = *puDst;
2592 uint32_t uResult = uDst + 1;
2593 *puDst = uResult;
2594 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2595}
2596
2597
2598IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2599{
2600 uint16_t uDst = *puDst;
2601 uint16_t uResult = uDst + 1;
2602 *puDst = uResult;
2603 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2604}
2605
2606IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2607{
2608 uint8_t uDst = *puDst;
2609 uint8_t uResult = uDst + 1;
2610 *puDst = uResult;
2611 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2612}
2613
2614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2615
2616
2617/*
2618 * DEC
2619 */
2620
2621IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2622{
2623 uint64_t uDst = *puDst;
2624 uint64_t uResult = uDst - 1;
2625 *puDst = uResult;
2626 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2627}
2628
2629# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint32_t uDst = *puDst;
2634 uint32_t uResult = uDst - 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2637}
2638
2639
2640IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2641{
2642 uint16_t uDst = *puDst;
2643 uint16_t uResult = uDst - 1;
2644 *puDst = uResult;
2645 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2646}
2647
2648
2649IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2650{
2651 uint8_t uDst = *puDst;
2652 uint8_t uResult = uDst - 1;
2653 *puDst = uResult;
2654 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2655}
2656
2657# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2658
2659
2660/*
2661 * NOT
2662 */
2663
2664IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2665{
2666 uint64_t uDst = *puDst;
2667 uint64_t uResult = ~uDst;
2668 *puDst = uResult;
2669 /* EFLAGS are not modified. */
2670 RT_NOREF_PV(pfEFlags);
2671}
2672
2673# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2674
2675IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2676{
2677 uint32_t uDst = *puDst;
2678 uint32_t uResult = ~uDst;
2679 *puDst = uResult;
2680 /* EFLAGS are not modified. */
2681 RT_NOREF_PV(pfEFlags);
2682}
2683
2684IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2685{
2686 uint16_t uDst = *puDst;
2687 uint16_t uResult = ~uDst;
2688 *puDst = uResult;
2689 /* EFLAGS are not modified. */
2690 RT_NOREF_PV(pfEFlags);
2691}
2692
2693IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2694{
2695 uint8_t uDst = *puDst;
2696 uint8_t uResult = ~uDst;
2697 *puDst = uResult;
2698 /* EFLAGS are not modified. */
2699 RT_NOREF_PV(pfEFlags);
2700}
2701
2702# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2703
2704
2705/*
2706 * NEG
2707 */
2708
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for a NEG instruction.
 *
 * This is a statement macro, it does not produce a value.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for the CF, AF
 *                          and OF calculations).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflNew = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS & ~X86_EFL_CF); \
        /* CF: set unless the operand was zero. */ \
        fEflNew |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        /* OF: taken from the bits that are set in both the operand and the result. */ \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        /* AF: set when operand and result differ in the X86_EFL_AF bit position. */ \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflNew |= g_afParity[(a_uResult) & 0xff]; \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
2730
2731IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2732{
2733 uint64_t uDst = *puDst;
2734 uint64_t uResult = (uint64_t)0 - uDst;
2735 *puDst = uResult;
2736 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2737}
2738
2739# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2740
2741IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2742{
2743 uint32_t uDst = *puDst;
2744 uint32_t uResult = (uint32_t)0 - uDst;
2745 *puDst = uResult;
2746 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2747}
2748
2749
2750IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2751{
2752 uint16_t uDst = *puDst;
2753 uint16_t uResult = (uint16_t)0 - uDst;
2754 *puDst = uResult;
2755 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2756}
2757
2758
2759IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2760{
2761 uint8_t uDst = *puDst;
2762 uint8_t uResult = (uint8_t)0 - uDst;
2763 *puDst = uResult;
2764 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2765}
2766
2767# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2768
2769/*
2770 * Locked variants.
2771 */
2772
2773/** Emit a function for doing a locked unary operand operation. */
2774# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2775 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2776 uint32_t *pfEFlags)) \
2777 { \
2778 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2779 uint ## a_cBitsWidth ## _t uTmp; \
2780 uint32_t fEflTmp; \
2781 do \
2782 { \
2783 uTmp = uOld; \
2784 fEflTmp = *pfEFlags; \
2785 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2786 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2787 *pfEFlags = fEflTmp; \
2788 }
2789
2790EMIT_LOCKED_UNARY_OP(inc, 64)
2791EMIT_LOCKED_UNARY_OP(dec, 64)
2792EMIT_LOCKED_UNARY_OP(not, 64)
2793EMIT_LOCKED_UNARY_OP(neg, 64)
2794# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2795EMIT_LOCKED_UNARY_OP(inc, 32)
2796EMIT_LOCKED_UNARY_OP(dec, 32)
2797EMIT_LOCKED_UNARY_OP(not, 32)
2798EMIT_LOCKED_UNARY_OP(neg, 32)
2799
2800EMIT_LOCKED_UNARY_OP(inc, 16)
2801EMIT_LOCKED_UNARY_OP(dec, 16)
2802EMIT_LOCKED_UNARY_OP(not, 16)
2803EMIT_LOCKED_UNARY_OP(neg, 16)
2804
2805EMIT_LOCKED_UNARY_OP(inc, 8)
2806EMIT_LOCKED_UNARY_OP(dec, 8)
2807EMIT_LOCKED_UNARY_OP(not, 8)
2808EMIT_LOCKED_UNARY_OP(neg, 8)
2809# endif
2810
2811#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2812
2813
2814/*********************************************************************************************************************************
2815* Shifting and Rotating *
2816*********************************************************************************************************************************/
2817
2818/*
2819 * ROL
2820 */
2821#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2822IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2823{ \
2824 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2825 if (cShift) \
2826 { \
2827 if (a_cBitsWidth < 32) \
2828 cShift &= a_cBitsWidth - 1; \
2829 a_uType const uDst = *puDst; \
2830 a_uType const uResult = a_fnHlp(uDst, cShift); \
2831 *puDst = uResult; \
2832 \
2833 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
2834 it the same way as for 1 bit shifts. */ \
2835 AssertCompile(X86_EFL_CF_BIT == 0); \
2836 uint32_t fEfl = *pfEFlags; \
2837 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2838 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2839 fEfl |= fCarry; \
2840 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2841 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2842 else /* Intel 10980XE: According to the first sub-shift: */ \
2843 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2844 *pfEFlags = fEfl; \
2845 } \
2846}
2847
2848#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2849EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2850#endif
2851EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2852EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2853
2854#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2855EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2856#endif
2857EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2858EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2859
2860DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2861{
2862 return (uValue << cShift) | (uValue >> (16 - cShift));
2863}
2864#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2865EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2866#endif
2867EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2868EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2869
2870DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2871{
2872 return (uValue << cShift) | (uValue >> (8 - cShift));
2873}
2874#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2875EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2876#endif
2877EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2878EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2879
2880
2881/*
2882 * ROR
2883 */
2884#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2885IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2886{ \
2887 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2888 if (cShift) \
2889 { \
2890 if (a_cBitsWidth < 32) \
2891 cShift &= a_cBitsWidth - 1; \
2892 a_uType const uDst = *puDst; \
2893 a_uType const uResult = a_fnHlp(uDst, cShift); \
2894 *puDst = uResult; \
2895 \
2896 /* Calc EFLAGS: */ \
2897 AssertCompile(X86_EFL_CF_BIT == 0); \
2898 uint32_t fEfl = *pfEFlags; \
2899 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2900 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2901 fEfl |= fCarry; \
2902 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2903 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2904 else /* Intel 10980XE: According to the first sub-shift: */ \
2905 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2906 *pfEFlags = fEfl; \
2907 } \
2908}
2909
2910#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2911EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2912#endif
2913EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2914EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2915
2916#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2917EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2918#endif
2919EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2920EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2921
2922DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2923{
2924 return (uValue >> cShift) | (uValue << (16 - cShift));
2925}
2926#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2927EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2928#endif
2929EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2930EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2931
2932DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2933{
2934 return (uValue >> cShift) | (uValue << (8 - cShift));
2935}
2936#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2937EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2938#endif
2939EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2940EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2941
2942
2943/*
2944 * RCL
2945 */
2946#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2947IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2948{ \
2949 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2950 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2951 cShift %= a_cBitsWidth + 1; \
2952 if (cShift) \
2953 { \
2954 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2955 cShift %= a_cBitsWidth + 1; \
2956 a_uType const uDst = *puDst; \
2957 a_uType uResult = uDst << cShift; \
2958 if (cShift > 1) \
2959 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2960 \
2961 AssertCompile(X86_EFL_CF_BIT == 0); \
2962 uint32_t fEfl = *pfEFlags; \
2963 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2964 uResult |= (a_uType)fInCarry << (cShift - 1); \
2965 \
2966 *puDst = uResult; \
2967 \
2968 /* Calc EFLAGS. */ \
2969 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2970 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2971 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2972 fEfl |= fOutCarry; \
2973 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2974 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
2975 else /* Intel 10980XE: According to the first sub-shift: */ \
2976 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2977 *pfEFlags = fEfl; \
2978 } \
2979}
2980
2981#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2982EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
2983#endif
2984EMIT_RCL(64, uint64_t, _intel, 1)
2985EMIT_RCL(64, uint64_t, _amd, 0)
2986
2987#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2988EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
2989#endif
2990EMIT_RCL(32, uint32_t, _intel, 1)
2991EMIT_RCL(32, uint32_t, _amd, 0)
2992
2993#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2994EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
2995#endif
2996EMIT_RCL(16, uint16_t, _intel, 1)
2997EMIT_RCL(16, uint16_t, _amd, 0)
2998
2999#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3000EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3001#endif
3002EMIT_RCL(8, uint8_t, _intel, 1)
3003EMIT_RCL(8, uint8_t, _amd, 0)
3004
3005
3006/*
3007 * RCR
3008 */
3009#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3010IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3011{ \
3012 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3013 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3014 cShift %= a_cBitsWidth + 1; \
3015 if (cShift) \
3016 { \
3017 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3018 cShift %= a_cBitsWidth + 1; \
3019 a_uType const uDst = *puDst; \
3020 a_uType uResult = uDst >> cShift; \
3021 if (cShift > 1) \
3022 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3023 \
3024 AssertCompile(X86_EFL_CF_BIT == 0); \
3025 uint32_t fEfl = *pfEFlags; \
3026 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3027 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3028 *puDst = uResult; \
3029 \
3030 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3031 it the same way as for 1 bit shifts. */ \
3032 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3033 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3034 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3035 fEfl |= fOutCarry; \
3036 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3037 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3038 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3039 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3040 *pfEFlags = fEfl; \
3041 } \
3042}
3043
3044#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3045EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3046#endif
3047EMIT_RCR(64, uint64_t, _intel, 1)
3048EMIT_RCR(64, uint64_t, _amd, 0)
3049
3050#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3051EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3052#endif
3053EMIT_RCR(32, uint32_t, _intel, 1)
3054EMIT_RCR(32, uint32_t, _amd, 0)
3055
3056#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3057EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3058#endif
3059EMIT_RCR(16, uint16_t, _intel, 1)
3060EMIT_RCR(16, uint16_t, _amd, 0)
3061
3062#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3063EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3064#endif
3065EMIT_RCR(8, uint8_t, _intel, 1)
3066EMIT_RCR(8, uint8_t, _amd, 0)
3067
3068
3069/*
3070 * SHL
3071 */
3072#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3073IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3074{ \
3075 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3076 if (cShift) \
3077 { \
3078 a_uType const uDst = *puDst; \
3079 a_uType uResult = uDst << cShift; \
3080 *puDst = uResult; \
3081 \
3082 /* Calc EFLAGS. */ \
3083 AssertCompile(X86_EFL_CF_BIT == 0); \
3084 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3085 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3086 fEfl |= fCarry; \
3087 if (!a_fIntelFlags) \
3088 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3089 else \
3090 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3091 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3092 fEfl |= X86_EFL_CALC_ZF(uResult); \
3093 fEfl |= g_afParity[uResult & 0xff]; \
3094 if (!a_fIntelFlags) \
3095 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3096 *pfEFlags = fEfl; \
3097 } \
3098}
3099
3100#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3101EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3102#endif
3103EMIT_SHL(64, uint64_t, _intel, 1)
3104EMIT_SHL(64, uint64_t, _amd, 0)
3105
3106#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3107EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3108#endif
3109EMIT_SHL(32, uint32_t, _intel, 1)
3110EMIT_SHL(32, uint32_t, _amd, 0)
3111
3112#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3113EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3114#endif
3115EMIT_SHL(16, uint16_t, _intel, 1)
3116EMIT_SHL(16, uint16_t, _amd, 0)
3117
3118#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3119EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3120#endif
3121EMIT_SHL(8, uint8_t, _intel, 1)
3122EMIT_SHL(8, uint8_t, _amd, 0)
3123
3124
3125/*
3126 * SHR
3127 */
3128#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3129IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3130{ \
3131 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3132 if (cShift) \
3133 { \
3134 a_uType const uDst = *puDst; \
3135 a_uType uResult = uDst >> cShift; \
3136 *puDst = uResult; \
3137 \
3138 /* Calc EFLAGS. */ \
3139 AssertCompile(X86_EFL_CF_BIT == 0); \
3140 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3141 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3142 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3143 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3144 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3145 fEfl |= X86_EFL_CALC_ZF(uResult); \
3146 fEfl |= g_afParity[uResult & 0xff]; \
3147 if (!a_fIntelFlags) \
3148 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3149 *pfEFlags = fEfl; \
3150 } \
3151}
3152
3153#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3154EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3155#endif
3156EMIT_SHR(64, uint64_t, _intel, 1)
3157EMIT_SHR(64, uint64_t, _amd, 0)
3158
3159#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3160EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3161#endif
3162EMIT_SHR(32, uint32_t, _intel, 1)
3163EMIT_SHR(32, uint32_t, _amd, 0)
3164
3165#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3166EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3167#endif
3168EMIT_SHR(16, uint16_t, _intel, 1)
3169EMIT_SHR(16, uint16_t, _amd, 0)
3170
3171#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3172EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3173#endif
3174EMIT_SHR(8, uint8_t, _intel, 1)
3175EMIT_SHR(8, uint8_t, _amd, 0)
3176
3177
3178/*
3179 * SAR
3180 */
3181#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3182IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3183{ \
3184 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3185 if (cShift) \
3186 { \
3187 a_iType const iDst = (a_iType)*puDst; \
3188 a_uType uResult = iDst >> cShift; \
3189 *puDst = uResult; \
3190 \
3191 /* Calc EFLAGS. \
3192 Note! The OF flag is always zero because the result never differs from the input. */ \
3193 AssertCompile(X86_EFL_CF_BIT == 0); \
3194 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3195 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3196 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3197 fEfl |= X86_EFL_CALC_ZF(uResult); \
3198 fEfl |= g_afParity[uResult & 0xff]; \
3199 if (!a_fIntelFlags) \
3200 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3201 *pfEFlags = fEfl; \
3202 } \
3203}
3204
3205#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3206EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3207#endif
3208EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3209EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3210
3211#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3212EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3213#endif
3214EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3215EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3216
3217#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3218EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3219#endif
3220EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3221EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3222
3223#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3224EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3225#endif
3226EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3227EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3228
3229
3230/*
3231 * SHLD
3232 *
3233 * - CF is the last bit shifted out of puDst.
3234 * - AF is always cleared by Intel 10980XE.
3235 * - AF is always set by AMD 3990X.
3236 * - OF is set according to the first shift on Intel 10980XE, it seems.
3237 * - OF is set according to the last sub-shift on AMD 3990X.
3238 * - ZF, SF and PF are calculated according to the result by both vendors.
3239 *
 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
 * pick either the source register or the destination register for input bits
 * when going beyond 16.  According to https://www.sandpile.org/x86/flags.htm
 * Intel has changed behaviour here several times.  We implement what current
 * Skylake-based CPUs do for now; this can be extended later as needed.
3245 */
3246#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3247IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3248 uint32_t *pfEFlags)) \
3249{ \
3250 cShift &= a_cBitsWidth - 1; \
3251 if (cShift) \
3252 { \
3253 a_uType const uDst = *puDst; \
3254 a_uType uResult = uDst << cShift; \
3255 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3256 *puDst = uResult; \
3257 \
3258 /* CALC EFLAGS: */ \
3259 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3260 if (a_fIntelFlags) \
3261 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3262 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3263 else \
3264 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3265 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3266 fEfl |= X86_EFL_AF; \
3267 } \
3268 AssertCompile(X86_EFL_CF_BIT == 0); \
3269 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3270 fEfl |= g_afParity[uResult & 0xff]; \
3271 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3272 fEfl |= X86_EFL_CALC_ZF(uResult); \
3273 *pfEFlags = fEfl; \
3274 } \
3275}
3276
3277#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3278EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3279#endif
3280EMIT_SHLD(64, uint64_t, _intel, 1)
3281EMIT_SHLD(64, uint64_t, _amd, 0)
3282
3283#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3284EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3285#endif
3286EMIT_SHLD(32, uint32_t, _intel, 1)
3287EMIT_SHLD(32, uint32_t, _amd, 0)
3288
3289#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3290IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3291{ \
3292 cShift &= 31; \
3293 if (cShift) \
3294 { \
3295 uint16_t const uDst = *puDst; \
3296 uint64_t const uTmp = a_fIntelFlags \
3297 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3298 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3299 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3300 *puDst = uResult; \
3301 \
3302 /* CALC EFLAGS: */ \
3303 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3304 AssertCompile(X86_EFL_CF_BIT == 0); \
3305 if (a_fIntelFlags) \
3306 { \
3307 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3308 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3309 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3310 } \
3311 else \
3312 { \
3313 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3314 if (cShift < 16) \
3315 { \
3316 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3317 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3318 } \
3319 else \
3320 { \
3321 if (cShift == 16) \
3322 fEfl |= uDst & X86_EFL_CF; \
3323 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3324 } \
3325 fEfl |= X86_EFL_AF; \
3326 } \
3327 fEfl |= g_afParity[uResult & 0xff]; \
3328 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3329 fEfl |= X86_EFL_CALC_ZF(uResult); \
3330 *pfEFlags = fEfl; \
3331 } \
3332}
3333
3334#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3335EMIT_SHLD_16(RT_NOTHING, 1)
3336#endif
3337EMIT_SHLD_16(_intel, 1)
3338EMIT_SHLD_16(_amd, 0)
3339
3340
3341/*
3342 * SHRD
3343 *
3344 * EFLAGS behaviour seems to be the same as with SHLD:
3345 * - CF is the last bit shifted out of puDst.
3346 * - AF is always cleared by Intel 10980XE.
3347 * - AF is always set by AMD 3990X.
3348 * - OF is set according to the first shift on Intel 10980XE, it seems.
3349 * - OF is set according to the last sub-shift on AMD 3990X.
3350 * - ZF, SF and PF are calculated according to the result by both vendors.
3351 *
 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
 * pick either the source register or the destination register for input bits
 * when going beyond 16.  According to https://www.sandpile.org/x86/flags.htm
 * Intel has changed behaviour here several times.  We implement what current
 * Skylake-based CPUs do for now; this can be extended later as needed.
3357 */
3358#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3359IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3360{ \
3361 cShift &= a_cBitsWidth - 1; \
3362 if (cShift) \
3363 { \
3364 a_uType const uDst = *puDst; \
3365 a_uType uResult = uDst >> cShift; \
3366 uResult |= uSrc << (a_cBitsWidth - cShift); \
3367 *puDst = uResult; \
3368 \
3369 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3370 AssertCompile(X86_EFL_CF_BIT == 0); \
3371 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3372 if (a_fIntelFlags) \
3373 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3374 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3375 else \
3376 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3377 if (cShift > 1) /* Set according to last shift. */ \
3378 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3379 else \
3380 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3381 fEfl |= X86_EFL_AF; \
3382 } \
3383 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3384 fEfl |= X86_EFL_CALC_ZF(uResult); \
3385 fEfl |= g_afParity[uResult & 0xff]; \
3386 *pfEFlags = fEfl; \
3387 } \
3388}
3389
3390#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3391EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3392#endif
3393EMIT_SHRD(64, uint64_t, _intel, 1)
3394EMIT_SHRD(64, uint64_t, _amd, 0)
3395
3396#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3397EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3398#endif
3399EMIT_SHRD(32, uint32_t, _intel, 1)
3400EMIT_SHRD(32, uint32_t, _amd, 0)
3401
3402#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3403IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3404{ \
3405 cShift &= 31; \
3406 if (cShift) \
3407 { \
3408 uint16_t const uDst = *puDst; \
3409 uint64_t const uTmp = a_fIntelFlags \
3410 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3411 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3412 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3413 *puDst = uResult; \
3414 \
3415 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3416 AssertCompile(X86_EFL_CF_BIT == 0); \
3417 if (a_fIntelFlags) \
3418 { \
3419 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3420 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3421 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3422 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3423 } \
3424 else \
3425 { \
3426 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3427 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3428 /* AMD 3990X: Set according to last shift. AF always set. */ \
3429 if (cShift > 1) /* Set according to last shift. */ \
3430 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3431 else \
3432 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3433 fEfl |= X86_EFL_AF; \
3434 } \
3435 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3436 fEfl |= X86_EFL_CALC_ZF(uResult); \
3437 fEfl |= g_afParity[uResult & 0xff]; \
3438 *pfEFlags = fEfl; \
3439 } \
3440}
3441
3442#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3443EMIT_SHRD_16(RT_NOTHING, 1)
3444#endif
3445EMIT_SHRD_16(_intel, 1)
3446EMIT_SHRD_16(_amd, 0)
3447
3448
3449/*
3450 * RORX (BMI2)
3451 */
3452#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3453IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3454{ \
3455 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3456}
3457
3458#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3459EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3460#endif
3461#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3462EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3463#endif
3464
3465
3466/*
3467 * SHLX (BMI2)
3468 */
3469#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3470IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3471{ \
3472 cShift &= a_cBitsWidth - 1; \
3473 *puDst = uSrc << cShift; \
3474}
3475
3476#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3477EMIT_SHLX(64, uint64_t, RT_NOTHING)
3478EMIT_SHLX(64, uint64_t, _fallback)
3479#endif
3480#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3481EMIT_SHLX(32, uint32_t, RT_NOTHING)
3482EMIT_SHLX(32, uint32_t, _fallback)
3483#endif
3484
3485
3486/*
3487 * SHRX (BMI2)
3488 */
3489#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3490IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3491{ \
3492 cShift &= a_cBitsWidth - 1; \
3493 *puDst = uSrc >> cShift; \
3494}
3495
3496#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3497EMIT_SHRX(64, uint64_t, RT_NOTHING)
3498EMIT_SHRX(64, uint64_t, _fallback)
3499#endif
3500#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3501EMIT_SHRX(32, uint32_t, RT_NOTHING)
3502EMIT_SHRX(32, uint32_t, _fallback)
3503#endif
3504
3505
3506/*
3507 * SARX (BMI2)
3508 */
3509#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3510IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3511{ \
3512 cShift &= a_cBitsWidth - 1; \
3513 *puDst = (a_iType)uSrc >> cShift; \
3514}
3515
3516#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3517EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3518EMIT_SARX(64, uint64_t, int64_t, _fallback)
3519#endif
3520#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3521EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3522EMIT_SARX(32, uint32_t, int32_t, _fallback)
3523#endif
3524
3525
3526/*
3527 * PDEP (BMI2)
3528 */
3529#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3530IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3531{ \
3532 a_uType uResult = 0; \
3533 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3534 if (fMask & ((a_uType)1 << iMaskBit)) \
3535 { \
3536 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3537 iBit++; \
3538 } \
3539 *puDst = uResult; \
3540}
3541
3542#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3543EMIT_PDEP(64, uint64_t, RT_NOTHING)
3544#endif
3545EMIT_PDEP(64, uint64_t, _fallback)
3546#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3547EMIT_PDEP(32, uint32_t, RT_NOTHING)
3548#endif
3549EMIT_PDEP(32, uint32_t, _fallback)
3550
3551/*
3552 * PEXT (BMI2)
3553 */
3554#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PEXT(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PEXT(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PEXT(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PEXT(32, uint32_t, _fallback)
3575
3576
3577#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3578
3579# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3580/*
3581 * BSWAP
3582 */
3583
3584IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3585{
3586 *puDst = ASMByteSwapU64(*puDst);
3587}
3588
3589
3590IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3591{
3592 *puDst = ASMByteSwapU32(*puDst);
3593}
3594
3595
3596/* Note! undocument, so 32-bit arg */
3597IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3598{
3599#if 0
3600 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3601#else
3602 /* This is the behaviour AMD 3990x (64-bit mode): */
3603 *(uint16_t *)puDst = 0;
3604#endif
3605}
3606
3607# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3608
3609
3610
3611# if defined(IEM_WITHOUT_ASSEMBLY)
3612
3613/*
3614 * LFENCE, SFENCE & MFENCE.
3615 */
3616
3617IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3618{
3619 ASMReadFence();
3620}
3621
3622
3623IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3624{
3625 ASMWriteFence();
3626}
3627
3628
3629IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3630{
3631 ASMMemoryFence();
3632}
3633
3634
3635# ifndef RT_ARCH_ARM64
3636IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3637{
3638 ASMMemoryFence();
3639}
3640# endif
3641
3642# endif
3643
3644#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3645
3646
3647IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3648{
3649 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3650 {
3651 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3652 *pu16Dst |= u16Src & X86_SEL_RPL;
3653
3654 *pfEFlags |= X86_EFL_ZF;
3655 }
3656 else
3657 *pfEFlags &= ~X86_EFL_ZF;
3658}
3659
3660
3661#if defined(IEM_WITHOUT_ASSEMBLY)
3662
3663/*********************************************************************************************************************************
3664* x87 FPU Loads *
3665*********************************************************************************************************************************/
3666
/**
 * FLD worker: widens a 32-bit floating point value to the 80-bit format.
 *
 * Normal, zero, subnormal, infinity and NaN inputs are each handled by their
 * own branch.  A subnormal input sets DE and a signalling NaN sets IE; when
 * the matching FCW mask bit (DM / IM) is clear, ES and B are set as well.
 *
 * @param   pFpuState   Supplies FCW (mask bits) and the C0/C2/C3 bits of FSW
 *                      that are carried over into the result FSW.
 * @param   pFpuRes     Receives the 80-bit result and the new FSW.
 * @param   pr32Val     The 32-bit value to load.
 */
3667 IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3668 {
3669 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3670 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3671 {
/* Normal: fraction moved up to the top of the wider field, exponent rebiased. */
3672 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3673 pFpuRes->r80Result.sj64.fInteger = 1;
3674 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3675 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3676 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3677 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3678 }
3679 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3680 {
/* Zero: only the sign is carried over. */
3681 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3682 pFpuRes->r80Result.s.uExponent = 0;
3683 pFpuRes->r80Result.s.uMantissa = 0;
3684 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3685 }
3686 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3687 {
3688 /* Subnormal values gets normalized. */
3689 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3690 pFpuRes->r80Result.sj64.fInteger = 1;
/* cExtraShift is the additional left shift needed to normalize; the exponent is lowered by the same amount. */
3691 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3692 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3693 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3694 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3695 pFpuRes->FSW |= X86_FSW_DE;
3696 if (!(pFpuState->FCW & X86_FCW_DM))
3697 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3698 }
3699 else if (RTFLOAT32U_IS_INF(pr32Val))
3700 {
/* Infinity: maximum exponent, only mantissa bit 63 set. */
3701 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3702 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3703 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3704 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3705 }
3706 else
3707 {
3708 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3709 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3710 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3711 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3712 pFpuRes->r80Result.sj64.fInteger = 1;
3713 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3714 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3715 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3716 {
3717 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3718 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3719 pFpuRes->FSW |= X86_FSW_IE;
3720
3721 if (!(pFpuState->FCW & X86_FCW_IM))
3722 {
3723 /* The value is not pushed. */
/* TOP is cleared in the returned FSW and the whole 80-bit result is zeroed. */
3724 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3725 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3726 pFpuRes->r80Result.au64[0] = 0;
3727 pFpuRes->r80Result.au16[4] = 0;
3728 }
3729 }
3730 else
3731 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3732 }
3733 }
3734
3735
/**
 * FLD worker: widens a 64-bit floating point value to the 80-bit format.
 *
 * Normal, zero, subnormal, infinity and NaN inputs are each handled by their
 * own branch.  A subnormal input sets DE and a signalling NaN sets IE; when
 * the matching FCW mask bit (DM / IM) is clear, ES and B are set as well.
 *
 * @param   pFpuState   Supplies FCW (mask bits) and the C0/C2/C3 bits of FSW
 *                      that are carried over into the result FSW.
 * @param   pFpuRes     Receives the 80-bit result and the new FSW.
 * @param   pr64Val     The 64-bit value to load.
 */
3736 IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3737 {
3738 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3739 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3740 {
/* Normal: fraction moved up to the top of the wider field, exponent rebiased. */
3741 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3742 pFpuRes->r80Result.sj64.fInteger = 1;
3743 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3744 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3745 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3746 }
3747 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3748 {
/* Zero: only the sign is carried over. */
3749 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3750 pFpuRes->r80Result.s.uExponent = 0;
3751 pFpuRes->r80Result.s.uMantissa = 0;
3752 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3753 }
3754 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3755 {
3756 /* Subnormal values gets normalized. */
3757 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3758 pFpuRes->r80Result.sj64.fInteger = 1;
/* cExtraShift is the additional left shift needed to normalize; the exponent is lowered by the same amount. */
3759 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3760 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3761 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3762 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3763 pFpuRes->FSW |= X86_FSW_DE;
3764 if (!(pFpuState->FCW & X86_FCW_DM))
3765 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3766 }
3767 else if (RTFLOAT64U_IS_INF(pr64Val))
3768 {
/* Infinity: maximum exponent, only mantissa bit 63 set. */
3769 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3770 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3771 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3772 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3773 }
3774 else
3775 {
3776 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3777 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3778 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3779 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3780 pFpuRes->r80Result.sj64.fInteger = 1;
3781 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3782 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3783 {
3784 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3785 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3786 pFpuRes->FSW |= X86_FSW_IE;
3787
3788 if (!(pFpuState->FCW & X86_FCW_IM))
3789 {
3790 /* The value is not pushed. */
/* TOP is cleared in the returned FSW and the whole 80-bit result is zeroed. */
3791 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3792 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3793 pFpuRes->r80Result.au64[0] = 0;
3794 pFpuRes->r80Result.au16[4] = 0;
3795 }
3796 }
3797 else
3798 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3799 }
3800 }
3801
3802
3803IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3804{
3805 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3806 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3807 /* Raises no exceptions. */
3808 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3809}
3810
3811
3812IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3813{
3814 pFpuRes->r80Result.sj64.fSign = 0;
3815 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3816 pFpuRes->r80Result.sj64.fInteger = 1;
3817 pFpuRes->r80Result.sj64.uFraction = 0;
3818
3819 /*
3820 * FPU status word:
3821 * - TOP is irrelevant, but we must match x86 assembly version.
3822 * - C1 is always cleared as we don't have any stack overflows.
3823 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3824 */
3825 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3826}
3827
3828
3829IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3830{
3831 pFpuRes->r80Result.sj64.fSign = 0;
3832 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3833 pFpuRes->r80Result.sj64.fInteger = 1;
3834 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3835 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3836 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3837 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3838}
3839
3840
3841IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3842{
3843 pFpuRes->r80Result.sj64.fSign = 0;
3844 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3845 pFpuRes->r80Result.sj64.fInteger = 1;
3846 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3847 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3848 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3849}
3850
3851
3852IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3853{
3854 pFpuRes->r80Result.sj64.fSign = 0;
3855 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3856 pFpuRes->r80Result.sj64.fInteger = 1;
3857 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3858 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3859 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3860 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3861}
3862
3863
3864IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3865{
3866 pFpuRes->r80Result.sj64.fSign = 0;
3867 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3868 pFpuRes->r80Result.sj64.fInteger = 1;
3869 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3870 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3871 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3872 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3873}
3874
3875
3876IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3877{
3878 pFpuRes->r80Result.sj64.fSign = 0;
3879 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3880 pFpuRes->r80Result.sj64.fInteger = 1;
3881 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3882 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3883 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3884 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3885}
3886
3887
3888IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3889{
3890 pFpuRes->r80Result.s.fSign = 0;
3891 pFpuRes->r80Result.s.uExponent = 0;
3892 pFpuRes->r80Result.s.uMantissa = 0;
3893 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3894}
3895
3896#define EMIT_FILD(a_cBits) \
3897IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3898 int ## a_cBits ## _t const *piVal)) \
3899{ \
3900 int ## a_cBits ## _t iVal = *piVal; \
3901 if (iVal == 0) \
3902 { \
3903 pFpuRes->r80Result.s.fSign = 0; \
3904 pFpuRes->r80Result.s.uExponent = 0; \
3905 pFpuRes->r80Result.s.uMantissa = 0; \
3906 } \
3907 else \
3908 { \
3909 if (iVal > 0) \
3910 pFpuRes->r80Result.s.fSign = 0; \
3911 else \
3912 { \
3913 pFpuRes->r80Result.s.fSign = 1; \
3914 iVal = -iVal; \
3915 } \
3916 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3917 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3918 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3919 } \
3920 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3921}
3922EMIT_FILD(16)
3923EMIT_FILD(32)
3924EMIT_FILD(64)
3925
3926
3927IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3928{
3929 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3930 if ( pd80Val->s.abPairs[0] == 0
3931 && pd80Val->s.abPairs[1] == 0
3932 && pd80Val->s.abPairs[2] == 0
3933 && pd80Val->s.abPairs[3] == 0
3934 && pd80Val->s.abPairs[4] == 0
3935 && pd80Val->s.abPairs[5] == 0
3936 && pd80Val->s.abPairs[6] == 0
3937 && pd80Val->s.abPairs[7] == 0
3938 && pd80Val->s.abPairs[8] == 0)
3939 {
3940 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3941 pFpuRes->r80Result.s.uExponent = 0;
3942 pFpuRes->r80Result.s.uMantissa = 0;
3943 }
3944 else
3945 {
3946 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3947
3948 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3949 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3950 cPairs--;
3951
3952 uint64_t uVal = 0;
3953 uint64_t uFactor = 1;
3954 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3955 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3956 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3957
3958 unsigned const cBits = ASMBitLastSetU64(uVal);
3959 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3960 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3961 }
3962}
3963
3964
3965/*********************************************************************************************************************************
3966* x87 FPU Stores *
3967*********************************************************************************************************************************/
3968
3969/**
3970 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
3971 *
3972 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3973 *
3974 * @returns Updated FPU status word value.
3975 * @param fSignIn Incoming sign indicator.
3976 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
3977 * @param iExponentIn Unbiased exponent.
3978 * @param fFcw The FPU control word.
3979 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
3980 * @param pr32Dst Where to return the output value, if one should be
3981 * returned.
3982 *
3983 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
3984 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
3985 */
3986static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
3987 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
3988{
3989 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
3990 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3991 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
3992 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
3993 ? fRoundingOffMask
3994 : 0;
3995 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
3996
3997 /*
3998 * Deal with potential overflows/underflows first, optimizing for none.
3999 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4000 */
4001 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4002 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4003 { /* likely? */ }
4004 /*
4005 * Underflow if the exponent zero or negative. This is attempted mapped
4006 * to a subnormal number when possible, with some additional trickery ofc.
4007 */
4008 else if (iExponentOut <= 0)
4009 {
4010 bool const fIsTiny = iExponentOut < 0
4011 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4012 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4013 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4014 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4015
4016 if (iExponentOut <= 0)
4017 {
4018 uMantissaIn = iExponentOut <= -63
4019 ? uMantissaIn != 0
4020 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4021 fRoundedOff = uMantissaIn & fRoundingOffMask;
4022 if (fRoundedOff && fIsTiny)
4023 fFsw |= X86_FSW_UE;
4024 iExponentOut = 0;
4025 }
4026 }
4027 /*
4028 * Overflow if at or above max exponent value or if we will reach max
4029 * when rounding. Will return +/-zero or +/-max value depending on
4030 * whether we're rounding or not.
4031 */
4032 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4033 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4034 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4035 {
4036 fFsw |= X86_FSW_OE;
4037 if (!(fFcw & X86_FCW_OM))
4038 return fFsw | X86_FSW_ES | X86_FSW_B;
4039 fFsw |= X86_FSW_PE;
4040 if (uRoundingAdd)
4041 fFsw |= X86_FSW_C1;
4042 if (!(fFcw & X86_FCW_PM))
4043 fFsw |= X86_FSW_ES | X86_FSW_B;
4044
4045 pr32Dst->s.fSign = fSignIn;
4046 if (uRoundingAdd)
4047 { /* Zero */
4048 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4049 pr32Dst->s.uFraction = 0;
4050 }
4051 else
4052 { /* Max */
4053 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4054 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4055 }
4056 return fFsw;
4057 }
4058
4059 /*
4060 * Normal or subnormal number.
4061 */
4062 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4063 uint64_t uMantissaOut = uMantissaIn;
4064 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4065 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4066 || fRoundedOff != uRoundingAdd)
4067 {
4068 uMantissaOut = uMantissaIn + uRoundingAdd;
4069 if (uMantissaOut >= uMantissaIn)
4070 { /* likely */ }
4071 else
4072 {
4073 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4074 iExponentOut++;
4075 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4076 fFsw |= X86_FSW_C1;
4077 }
4078 }
4079 else
4080 uMantissaOut = uMantissaIn;
4081
4082 /* Truncate the mantissa and set the return value. */
4083 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4084
4085 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4086 pr32Dst->s.uExponent = iExponentOut;
4087 pr32Dst->s.fSign = fSignIn;
4088
4089 /* Set status flags realted to rounding. */
4090 if (fRoundedOff)
4091 {
4092 fFsw |= X86_FSW_PE;
4093 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4094 fFsw |= X86_FSW_C1;
4095 if (!(fFcw & X86_FCW_PM))
4096 fFsw |= X86_FSW_ES | X86_FSW_B;
4097 }
4098
4099 return fFsw;
4100}
4101
4102
4103/**
4104 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4105 */
4106IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4107 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4108{
4109 uint16_t const fFcw = pFpuState->FCW;
4110 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4111 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4112 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4113 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4114 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4115 {
4116 pr32Dst->s.fSign = pr80Src->s.fSign;
4117 pr32Dst->s.uExponent = 0;
4118 pr32Dst->s.uFraction = 0;
4119 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4120 }
4121 else if (RTFLOAT80U_IS_INF(pr80Src))
4122 {
4123 pr32Dst->s.fSign = pr80Src->s.fSign;
4124 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4125 pr32Dst->s.uFraction = 0;
4126 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4127 }
4128 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4129 {
4130 /* Mapped to +/-QNaN */
4131 pr32Dst->s.fSign = pr80Src->s.fSign;
4132 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4133 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4134 }
4135 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4136 {
4137 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4138 if (fFcw & X86_FCW_IM)
4139 {
4140 pr32Dst->s.fSign = 1;
4141 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4142 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4143 fFsw |= X86_FSW_IE;
4144 }
4145 else
4146 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4147 }
4148 else if (RTFLOAT80U_IS_NAN(pr80Src))
4149 {
4150 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4151 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4152 {
4153 pr32Dst->s.fSign = pr80Src->s.fSign;
4154 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4155 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4156 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4157 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4158 fFsw |= X86_FSW_IE;
4159 }
4160 else
4161 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4162 }
4163 else
4164 {
4165 /* Denormal values causes both an underflow and precision exception. */
4166 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4167 if (fFcw & X86_FCW_UM)
4168 {
4169 pr32Dst->s.fSign = pr80Src->s.fSign;
4170 pr32Dst->s.uExponent = 0;
4171 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4172 {
4173 pr32Dst->s.uFraction = 1;
4174 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4175 if (!(fFcw & X86_FCW_PM))
4176 fFsw |= X86_FSW_ES | X86_FSW_B;
4177 }
4178 else
4179 {
4180 pr32Dst->s.uFraction = 0;
4181 fFsw |= X86_FSW_UE | X86_FSW_PE;
4182 if (!(fFcw & X86_FCW_PM))
4183 fFsw |= X86_FSW_ES | X86_FSW_B;
4184 }
4185 }
4186 else
4187 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4188 }
4189 *pu16FSW = fFsw;
4190}
4191
4192
4193/**
4194 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4195 *
4196 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4197 *
4198 * @returns Updated FPU status word value.
4199 * @param fSignIn Incoming sign indicator.
4200 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4201 * @param iExponentIn Unbiased exponent.
4202 * @param fFcw The FPU control word.
4203 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4204 * @param pr64Dst Where to return the output value, if one should be
4205 * returned.
4206 *
4207 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4208 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4209 */
4210static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4211 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4212{
4213 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4214 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4215 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4216 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4217 ? fRoundingOffMask
4218 : 0;
4219 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4220
4221 /*
4222 * Deal with potential overflows/underflows first, optimizing for none.
4223 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4224 */
4225 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4226 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4227 { /* likely? */ }
4228 /*
4229 * Underflow if the exponent zero or negative. This is attempted mapped
4230 * to a subnormal number when possible, with some additional trickery ofc.
4231 */
4232 else if (iExponentOut <= 0)
4233 {
4234 bool const fIsTiny = iExponentOut < 0
4235 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4236 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4237 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4238 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4239
4240 if (iExponentOut <= 0)
4241 {
4242 uMantissaIn = iExponentOut <= -63
4243 ? uMantissaIn != 0
4244 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4245 fRoundedOff = uMantissaIn & fRoundingOffMask;
4246 if (fRoundedOff && fIsTiny)
4247 fFsw |= X86_FSW_UE;
4248 iExponentOut = 0;
4249 }
4250 }
4251 /*
4252 * Overflow if at or above max exponent value or if we will reach max
4253 * when rounding. Will return +/-zero or +/-max value depending on
4254 * whether we're rounding or not.
4255 */
4256 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4257 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4258 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4259 {
4260 fFsw |= X86_FSW_OE;
4261 if (!(fFcw & X86_FCW_OM))
4262 return fFsw | X86_FSW_ES | X86_FSW_B;
4263 fFsw |= X86_FSW_PE;
4264 if (uRoundingAdd)
4265 fFsw |= X86_FSW_C1;
4266 if (!(fFcw & X86_FCW_PM))
4267 fFsw |= X86_FSW_ES | X86_FSW_B;
4268
4269 pr64Dst->s64.fSign = fSignIn;
4270 if (uRoundingAdd)
4271 { /* Zero */
4272 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4273 pr64Dst->s64.uFraction = 0;
4274 }
4275 else
4276 { /* Max */
4277 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4278 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4279 }
4280 return fFsw;
4281 }
4282
4283 /*
4284 * Normal or subnormal number.
4285 */
4286 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4287 uint64_t uMantissaOut = uMantissaIn;
4288 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4289 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4290 || fRoundedOff != uRoundingAdd)
4291 {
4292 uMantissaOut = uMantissaIn + uRoundingAdd;
4293 if (uMantissaOut >= uMantissaIn)
4294 { /* likely */ }
4295 else
4296 {
4297 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4298 iExponentOut++;
4299 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4300 fFsw |= X86_FSW_C1;
4301 }
4302 }
4303 else
4304 uMantissaOut = uMantissaIn;
4305
4306 /* Truncate the mantissa and set the return value. */
4307 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4308
4309 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4310 pr64Dst->s64.uExponent = iExponentOut;
4311 pr64Dst->s64.fSign = fSignIn;
4312
4313 /* Set status flags realted to rounding. */
4314 if (fRoundedOff)
4315 {
4316 fFsw |= X86_FSW_PE;
4317 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4318 fFsw |= X86_FSW_C1;
4319 if (!(fFcw & X86_FCW_PM))
4320 fFsw |= X86_FSW_ES | X86_FSW_B;
4321 }
4322
4323 return fFsw;
4324}
4325
4326
4327/**
4328 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4329 */
4330IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4331 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4332{
4333 uint16_t const fFcw = pFpuState->FCW;
4334 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4335 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4336 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4337 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4338 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4339 {
4340 pr64Dst->s64.fSign = pr80Src->s.fSign;
4341 pr64Dst->s64.uExponent = 0;
4342 pr64Dst->s64.uFraction = 0;
4343 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4344 }
4345 else if (RTFLOAT80U_IS_INF(pr80Src))
4346 {
4347 pr64Dst->s64.fSign = pr80Src->s.fSign;
4348 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4349 pr64Dst->s64.uFraction = 0;
4350 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4351 }
4352 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4353 {
4354 /* Mapped to +/-QNaN */
4355 pr64Dst->s64.fSign = pr80Src->s.fSign;
4356 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4357 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4358 }
4359 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4360 {
4361 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4362 if (fFcw & X86_FCW_IM)
4363 {
4364 pr64Dst->s64.fSign = 1;
4365 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4366 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4367 fFsw |= X86_FSW_IE;
4368 }
4369 else
4370 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4371 }
4372 else if (RTFLOAT80U_IS_NAN(pr80Src))
4373 {
4374 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4375 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4376 {
4377 pr64Dst->s64.fSign = pr80Src->s.fSign;
4378 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4379 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4380 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4381 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4382 fFsw |= X86_FSW_IE;
4383 }
4384 else
4385 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4386 }
4387 else
4388 {
4389 /* Denormal values causes both an underflow and precision exception. */
4390 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4391 if (fFcw & X86_FCW_UM)
4392 {
4393 pr64Dst->s64.fSign = pr80Src->s.fSign;
4394 pr64Dst->s64.uExponent = 0;
4395 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4396 {
4397 pr64Dst->s64.uFraction = 1;
4398 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4399 if (!(fFcw & X86_FCW_PM))
4400 fFsw |= X86_FSW_ES | X86_FSW_B;
4401 }
4402 else
4403 {
4404 pr64Dst->s64.uFraction = 0;
4405 fFsw |= X86_FSW_UE | X86_FSW_PE;
4406 if (!(fFcw & X86_FCW_PM))
4407 fFsw |= X86_FSW_ES | X86_FSW_B;
4408 }
4409 }
4410 else
4411 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4412 }
4413 *pu16FSW = fFsw;
4414}
4415
4416
4417IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4418 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4419{
4420 /*
4421 * FPU status word:
4422 * - TOP is irrelevant, but we must match x86 assembly version (0).
4423 * - C1 is always cleared as we don't have any stack overflows.
4424 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4425 */
4426 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4427 *pr80Dst = *pr80Src;
4428}
4429
4430
4431/*
4432 *
4433 * Mantissa:
4434 * 63 56 48 40 32 24 16 8 0
4435 * v v v v v v v v v
4436 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4437 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4438 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4439 *
4440 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4441 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4442 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4443 * where we'll drop off all but bit 63.
4444 */
4445#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4446IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4447 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4448{ \
4449 uint16_t const fFcw = pFpuState->FCW; \
4450 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4451 bool const fSignIn = pr80Val->s.fSign; \
4452 \
4453 /* \
4454 * Deal with normal numbers first. \
4455 */ \
4456 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4457 { \
4458 uint64_t uMantissa = pr80Val->s.uMantissa; \
4459 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4460 \
4461 if ((uint32_t)iExponent <= a_cBits - 2) \
4462 { \
4463 unsigned const cShiftOff = 63 - iExponent; \
4464 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4465 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4466 ? RT_BIT_64(cShiftOff - 1) \
4467 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4468 ? fRoundingOffMask \
4469 : 0; \
4470 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4471 \
4472 uMantissa >>= cShiftOff; \
4473 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4474 uMantissa += uRounding; \
4475 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4476 { \
4477 if (fRoundedOff) \
4478 { \
4479 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4480 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4481 else if (uRounding) \
4482 fFsw |= X86_FSW_C1; \
4483 fFsw |= X86_FSW_PE; \
4484 if (!(fFcw & X86_FCW_PM)) \
4485 fFsw |= X86_FSW_ES | X86_FSW_B; \
4486 } \
4487 \
4488 if (!fSignIn) \
4489 *piDst = (a_iType)uMantissa; \
4490 else \
4491 *piDst = -(a_iType)uMantissa; \
4492 } \
4493 else \
4494 { \
4495 /* overflowed after rounding. */ \
4496 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4497 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4498 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4499 \
4500 /* Special case for the integer minimum value. */ \
4501 if (fSignIn) \
4502 { \
4503 *piDst = a_iTypeMin; \
4504 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4505 if (!(fFcw & X86_FCW_PM)) \
4506 fFsw |= X86_FSW_ES | X86_FSW_B; \
4507 } \
4508 else \
4509 { \
4510 fFsw |= X86_FSW_IE; \
4511 if (fFcw & X86_FCW_IM) \
4512 *piDst = a_iTypeMin; \
4513 else \
4514 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4515 } \
4516 } \
4517 } \
4518 /* \
4519 * Tiny sub-zero numbers. \
4520 */ \
4521 else if (iExponent < 0) \
4522 { \
4523 if (!fSignIn) \
4524 { \
4525 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4526 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4527 { \
4528 *piDst = 1; \
4529 fFsw |= X86_FSW_C1; \
4530 } \
4531 else \
4532 *piDst = 0; \
4533 } \
4534 else \
4535 { \
4536 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4537 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4538 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4539 *piDst = 0; \
4540 else \
4541 { \
4542 *piDst = -1; \
4543 fFsw |= X86_FSW_C1; \
4544 } \
4545 } \
4546 fFsw |= X86_FSW_PE; \
4547 if (!(fFcw & X86_FCW_PM)) \
4548 fFsw |= X86_FSW_ES | X86_FSW_B; \
4549 } \
4550 /* \
4551 * Special MIN case. \
4552 */ \
4553 else if ( fSignIn && iExponent == a_cBits - 1 \
4554 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4555 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4556 : uMantissa == RT_BIT_64(63))) \
4557 { \
4558 *piDst = a_iTypeMin; \
4559 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4560 { \
4561 fFsw |= X86_FSW_PE; \
4562 if (!(fFcw & X86_FCW_PM)) \
4563 fFsw |= X86_FSW_ES | X86_FSW_B; \
4564 } \
4565 } \
4566 /* \
4567 * Too large/small number outside the target integer range. \
4568 */ \
4569 else \
4570 { \
4571 fFsw |= X86_FSW_IE; \
4572 if (fFcw & X86_FCW_IM) \
4573 *piDst = a_iTypeIndefinite; \
4574 else \
4575 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4576 } \
4577 } \
4578 /* \
4579 * Map both +0 and -0 to integer zero (signless/+). \
4580 */ \
4581 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4582 *piDst = 0; \
4583 /* \
4584 * Denormals are just really tiny sub-zero numbers that are either rounded \
4585 * to zero, 1 or -1 depending on sign and rounding control. \
4586 */ \
4587 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4588 { \
4589 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4590 *piDst = 0; \
4591 else \
4592 { \
4593 *piDst = fSignIn ? -1 : 1; \
4594 fFsw |= X86_FSW_C1; \
4595 } \
4596 fFsw |= X86_FSW_PE; \
4597 if (!(fFcw & X86_FCW_PM)) \
4598 fFsw |= X86_FSW_ES | X86_FSW_B; \
4599 } \
4600 /* \
4601 * All other special values are considered invalid arguments and result \
4602 * in an IE exception and indefinite value if masked. \
4603 */ \
4604 else \
4605 { \
4606 fFsw |= X86_FSW_IE; \
4607 if (fFcw & X86_FCW_IM) \
4608 *piDst = a_iTypeIndefinite; \
4609 else \
4610 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4611 } \
4612 *pu16FSW = fFsw; \
4613}
4614EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4615EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4616EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4617
4618#endif /*IEM_WITHOUT_ASSEMBLY */
4619
4620
4621/*
4622 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4623 *
4624 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4625 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4626 * thus the @a a_cBitsIn.
4627 */
4628#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4629IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4630 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4631{ \
4632 uint16_t const fFcw = pFpuState->FCW; \
4633 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4634 bool const fSignIn = pr80Val->s.fSign; \
4635 \
4636 /* \
4637 * Deal with normal numbers first. \
4638 */ \
4639 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4640 { \
4641 uint64_t uMantissa = pr80Val->s.uMantissa; \
4642 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4643 \
4644 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4645 { \
4646 unsigned const cShiftOff = 63 - iExponent; \
4647 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4648 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4649 uMantissa >>= cShiftOff; \
4650 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4651 if (!fSignIn) \
4652 *piDst = (a_iType)uMantissa; \
4653 else \
4654 *piDst = -(a_iType)uMantissa; \
4655 \
4656 if (fRoundedOff) \
4657 { \
4658 fFsw |= X86_FSW_PE; \
4659 if (!(fFcw & X86_FCW_PM)) \
4660 fFsw |= X86_FSW_ES | X86_FSW_B; \
4661 } \
4662 } \
4663 /* \
4664 * Tiny sub-zero numbers. \
4665 */ \
4666 else if (iExponent < 0) \
4667 { \
4668 *piDst = 0; \
4669 fFsw |= X86_FSW_PE; \
4670 if (!(fFcw & X86_FCW_PM)) \
4671 fFsw |= X86_FSW_ES | X86_FSW_B; \
4672 } \
4673 /* \
4674 * Special MIN case. \
4675 */ \
4676 else if ( fSignIn && iExponent == a_cBits - 1 \
4677 && (a_cBits < 64 \
4678 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4679 : uMantissa == RT_BIT_64(63)) ) \
4680 { \
4681 *piDst = a_iTypeMin; \
4682 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4683 { \
4684 fFsw |= X86_FSW_PE; \
4685 if (!(fFcw & X86_FCW_PM)) \
4686 fFsw |= X86_FSW_ES | X86_FSW_B; \
4687 } \
4688 } \
4689 /* \
4690 * Figure this weirdness. \
4691 */ \
4692 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4693 { \
4694 *piDst = 0; \
4695 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4696 { \
4697 fFsw |= X86_FSW_PE; \
4698 if (!(fFcw & X86_FCW_PM)) \
4699 fFsw |= X86_FSW_ES | X86_FSW_B; \
4700 } \
4701 } \
4702 /* \
4703 * Too large/small number outside the target integer range. \
4704 */ \
4705 else \
4706 { \
4707 fFsw |= X86_FSW_IE; \
4708 if (fFcw & X86_FCW_IM) \
4709 *piDst = a_iTypeIndefinite; \
4710 else \
4711 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4712 } \
4713 } \
4714 /* \
4715 * Map both +0 and -0 to integer zero (signless/+). \
4716 */ \
4717 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4718 *piDst = 0; \
4719 /* \
4720 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4721 */ \
4722 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4723 { \
4724 *piDst = 0; \
4725 fFsw |= X86_FSW_PE; \
4726 if (!(fFcw & X86_FCW_PM)) \
4727 fFsw |= X86_FSW_ES | X86_FSW_B; \
4728 } \
4729 /* \
4730 * All other special values are considered invalid arguments and result \
4731 * in an IE exception and indefinite value if masked. \
4732 */ \
4733 else \
4734 { \
4735 fFsw |= X86_FSW_IE; \
4736 if (fFcw & X86_FCW_IM) \
4737 *piDst = a_iTypeIndefinite; \
4738 else \
4739 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4740 } \
4741 *pu16FSW = fFsw; \
4742}
4743#if defined(IEM_WITHOUT_ASSEMBLY)
4744EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4745EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4746EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4747#endif
4748EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4749EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4750
4751
4752#if defined(IEM_WITHOUT_ASSEMBLY)
4753
4754IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4755 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4756{
4757 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4758 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4759 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4760 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4761 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4762
4763 uint16_t const fFcw = pFpuState->FCW;
4764 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4765 bool const fSignIn = pr80Src->s.fSign;
4766
4767 /*
4768 * Deal with normal numbers first.
4769 */
4770 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4771 {
4772 uint64_t uMantissa = pr80Src->s.uMantissa;
4773 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4774 if ( (uint32_t)iExponent <= 58
4775 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4776 {
4777 unsigned const cShiftOff = 63 - iExponent;
4778 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4779 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4780 ? RT_BIT_64(cShiftOff - 1)
4781 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4782 ? fRoundingOffMask
4783 : 0;
4784 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4785
4786 uMantissa >>= cShiftOff;
4787 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4788 uMantissa += uRounding;
4789 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4790 {
4791 if (fRoundedOff)
4792 {
4793 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4794 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4795 else if (uRounding)
4796 fFsw |= X86_FSW_C1;
4797 fFsw |= X86_FSW_PE;
4798 if (!(fFcw & X86_FCW_PM))
4799 fFsw |= X86_FSW_ES | X86_FSW_B;
4800 }
4801
4802 pd80Dst->s.fSign = fSignIn;
4803 pd80Dst->s.uPad = 0;
4804 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4805 {
4806 unsigned const uDigits = uMantissa % 100;
4807 uMantissa /= 100;
4808 uint8_t const bLo = uDigits % 10;
4809 uint8_t const bHi = uDigits / 10;
4810 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4811 }
4812 }
4813 else
4814 {
4815 /* overflowed after rounding. */
4816 fFsw |= X86_FSW_IE;
4817 if (fFcw & X86_FCW_IM)
4818 *pd80Dst = s_d80Indefinite;
4819 else
4820 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4821 }
4822 }
4823 /*
4824 * Tiny sub-zero numbers.
4825 */
4826 else if (iExponent < 0)
4827 {
4828 if (!fSignIn)
4829 {
4830 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4831 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4832 {
4833 *pd80Dst = s_ad80One[fSignIn];
4834 fFsw |= X86_FSW_C1;
4835 }
4836 else
4837 *pd80Dst = s_ad80Zeros[fSignIn];
4838 }
4839 else
4840 {
4841 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4842 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4843 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4844 *pd80Dst = s_ad80Zeros[fSignIn];
4845 else
4846 {
4847 *pd80Dst = s_ad80One[fSignIn];
4848 fFsw |= X86_FSW_C1;
4849 }
4850 }
4851 fFsw |= X86_FSW_PE;
4852 if (!(fFcw & X86_FCW_PM))
4853 fFsw |= X86_FSW_ES | X86_FSW_B;
4854 }
4855 /*
4856 * Too large/small number outside the target integer range.
4857 */
4858 else
4859 {
4860 fFsw |= X86_FSW_IE;
4861 if (fFcw & X86_FCW_IM)
4862 *pd80Dst = s_d80Indefinite;
4863 else
4864 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4865 }
4866 }
4867 /*
4868 * Map both +0 and -0 to integer zero (signless/+).
4869 */
4870 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4871 *pd80Dst = s_ad80Zeros[fSignIn];
4872 /*
4873 * Denormals are just really tiny sub-zero numbers that are either rounded
4874 * to zero, 1 or -1 depending on sign and rounding control.
4875 */
4876 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4877 {
4878 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4879 *pd80Dst = s_ad80Zeros[fSignIn];
4880 else
4881 {
4882 *pd80Dst = s_ad80One[fSignIn];
4883 fFsw |= X86_FSW_C1;
4884 }
4885 fFsw |= X86_FSW_PE;
4886 if (!(fFcw & X86_FCW_PM))
4887 fFsw |= X86_FSW_ES | X86_FSW_B;
4888 }
4889 /*
4890 * All other special values are considered invalid arguments and result
4891 * in an IE exception and indefinite value if masked.
4892 */
4893 else
4894 {
4895 fFsw |= X86_FSW_IE;
4896 if (fFcw & X86_FCW_IM)
4897 *pd80Dst = s_d80Indefinite;
4898 else
4899 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4900 }
4901 *pu16FSW = fFsw;
4902}
4903
4904
4905/*********************************************************************************************************************************
4906* FPU Helpers *
4907*********************************************************************************************************************************/
/* Compile-time checks of the byte sizes of the IPRT floating point types used by the helpers below. */
4908AssertCompileSize(RTFLOAT128U, 16);
4909AssertCompileSize(RTFLOAT80U, 10);
4910AssertCompileSize(RTFLOAT64U, 8);
4911AssertCompileSize(RTFLOAT32U, 4);
4912
4913/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days. They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro will declare a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
 * a normalization was performed.
 *
 * @note This must be applied before calling SoftFloat with a value that could be
 *       a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *       correctly.
 * @note The expansion is a declaration followed by an if/else statement that
 *       the caller terminates with a semicolon, so it must be used at
 *       statement level and @a a_pr80Val must be an assignable pointer
 *       variable.
 */
4928#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
4929 RTFLOAT80U a_r80ValNormalized; \
4930 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
4931 { \
4932 a_r80ValNormalized = *a_pr80Val; \
4933 a_r80ValNormalized.s.uExponent = 1; \
4934 a_pr80Val = &a_r80ValNormalized; \
4935 } else do {} while (0)
4936
4937#ifdef IEM_WITH_FLOAT128_FOR_FPU
4938
4939DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4940{
4941 int fNew;
4942 switch (fFcw & X86_FCW_RC_MASK)
4943 {
4944 default:
4945 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4946 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4947 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4948 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4949 }
4950 int fOld = fegetround();
4951 fesetround(fNew);
4952 return fOld;
4953}
4954
4955
4956DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4957{
4958 fesetround(fOld);
4959}
4960
4961DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4962{
4963 RT_NOREF(fFcw);
4964 RTFLOAT128U Tmp;
4965 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4966 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4967 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4968 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4969 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4970 {
4971 Assert(Tmp.s.uExponent == 0);
4972 Tmp.s2.uSignAndExponent++;
4973 }
4974 return *(_Float128 *)&Tmp;
4975}
4976
4977
4978DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
4979{
4980 RT_NOREF(fFcw);
4981 RTFLOAT128U Tmp;
4982 *(_Float128 *)&Tmp = rd128ValSrc;
4983 ASMCompilerBarrier();
4984 if (RTFLOAT128U_IS_NORMAL(&Tmp))
4985 {
4986 pr80Dst->s.fSign = Tmp.s64.fSign;
4987 pr80Dst->s.uExponent = Tmp.s64.uExponent;
4988 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
4989 | Tmp.s64.uFractionLo >> (64 - 15);
4990
4991 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4992 unsigned const cShiftOff = 64 - 15;
4993 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4994 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
4995 if (uRoundedOff)
4996 {
4997 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4998 ? RT_BIT_64(cShiftOff - 1)
4999 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5000 ? fRoundingOffMask
5001 : 0;
5002 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5003 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5004 || uRoundedOff != uRoundingAdd)
5005 {
5006 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5007 {
5008 uFraction += 1;
5009 if (!(uFraction & RT_BIT_64(63)))
5010 { /* likely */ }
5011 else
5012 {
5013 uFraction >>= 1;
5014 pr80Dst->s.uExponent++;
5015 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5016 return fFsw;
5017 }
5018 fFsw |= X86_FSW_C1;
5019 }
5020 }
5021 fFsw |= X86_FSW_PE;
5022 if (!(fFcw & X86_FCW_PM))
5023 fFsw |= X86_FSW_ES | X86_FSW_B;
5024 }
5025 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5026 }
5027 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5028 {
5029 pr80Dst->s.fSign = Tmp.s64.fSign;
5030 pr80Dst->s.uExponent = 0;
5031 pr80Dst->s.uMantissa = 0;
5032 }
5033 else if (RTFLOAT128U_IS_INF(&Tmp))
5034 {
5035 pr80Dst->s.fSign = Tmp.s64.fSign;
5036 pr80Dst->s.uExponent = 0;
5037 pr80Dst->s.uMantissa = 0;
5038 }
5039 return fFsw;
5040}
5041
5042
5043#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5044
5045/**
 * Initializer for the SoftFloat state structure.
 *
 * Expands to a brace initializer with five values, in this order:
 *      -# softfloat_tininess_afterRounding.
 *      -# The rounding mode selected by the RC field of @a a_fFcw: near_even
 *         for NEAREST, max for UP, min for DOWN and minMag otherwise.
 *      -# Zero.
 *      -# The X86_FCW_XCPT_MASK bits of @a a_fFcw.
 *      -# 64 for X86_FCW_PC_53, 32 for X86_FCW_PC_24 and 80 otherwise.
 *
 * NOTE(review): presumably these map onto the tininess mode, rounding mode,
 * exception flags, exception mask and rounding precision members of
 * softfloat_state_t - confirm against the structure definition.
 */
5046# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
5047 { \
5048 softfloat_tininess_afterRounding, \
5049 ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
5050 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP ? (uint8_t)softfloat_round_max \
5051 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN ? (uint8_t)softfloat_round_min \
5052 : (uint8_t)softfloat_round_minMag, \
5053 0, \
5054 (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
5055 ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
5056 : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
5057 }
5058
5059/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw with the following ORed in:
 *      - The softfloat_flag_c1 bit of the state's exception flags, shifted
 *        left by two (presumably into the X86_FSW_C1 position - confirm).
 *      - The exception flags covered by X86_FSW_XCPT_MASK.
 *      - X86_FSW_ES and X86_FSW_B if any of those exception flags has its
 *        mask bit clear in @a a_fFcw.
 *
 * @note Both @a a_pSoftState and @a a_fFcw are evaluated more than once.
 */
5060# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
5061 ( (a_fFsw) \
5062 | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
5063 | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
5064 | ( ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
5065 ? X86_FSW_ES | X86_FSW_B : 0) )
5066
5067
5068DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5069{
5070 RT_NOREF(fFcw);
5071 Assert(cBits > 64);
5072# if 0 /* rounding does not seem to help */
5073 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5074 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5075 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5076 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5077 {
5078 uint64_t uOld = r128.v[0];
5079 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5080 if (r128.v[0] < uOld)
5081 r128.v[1] += 1;
5082 }
5083# else
5084 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5085# endif
5086 return r128;
5087}
5088
5089
5090DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5091{
5092 RT_NOREF(fFcw);
5093 Assert(cBits > 64);
5094# if 0 /* rounding does not seem to help, not even on constants */
5095 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5096 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5097 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5098 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5099 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5100 {
5101 uint64_t uOld = r128.v[0];
5102 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5103 if (r128.v[0] < uOld)
5104 r128.v[1] += 1;
5105 }
5106 return r128;
5107# else
5108 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5109 return r128;
5110# endif
5111}
5112
5113
5114# if 0 /* unused */
/** Copies the two 64-bit words of an IPRT 128-bit float into a SoftFloat float128_t (currently compiled out). */
5115DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
5116{
5117 float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
5118 return r128;
5119}
5120# endif
5121
5122
5123/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5124DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5125{
5126 extFloat80_t Tmp;
5127 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5128 Tmp.signif = pr80Val->s2.uMantissa;
5129 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5130 return extF80_to_f128(Tmp, &Ignored);
5131}
5132
5133
5134/**
5135 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5136 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5137 *
5138 * This is only a structure format conversion, nothing else.
5139 */
5140DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5141{
5142 extFloat80_t Tmp;
5143 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5144 Tmp.signif = pr80Val->s2.uMantissa;
5145 return Tmp;
5146}
5147
5148
5149/**
5150 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5151 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5152 *
5153 * This is only a structure format conversion, nothing else.
5154 */
5155DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5156{
5157 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5158 pr80Dst->s2.uMantissa = r80XSrc.signif;
5159 return pr80Dst;
5160}
5161
5162
5163DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5164{
5165 RT_NOREF(fFcw);
5166 RTFLOAT128U Tmp;
5167 *(float128_t *)&Tmp = r128Src;
5168 ASMCompilerBarrier();
5169
5170 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5171 {
5172 pr80Dst->s.fSign = Tmp.s64.fSign;
5173 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5174 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5175 | Tmp.s64.uFractionLo >> (64 - 15);
5176
5177 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5178 unsigned const cShiftOff = 64 - 15;
5179 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5180 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5181 if (uRoundedOff)
5182 {
5183 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5184 ? RT_BIT_64(cShiftOff - 1)
5185 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5186 ? fRoundingOffMask
5187 : 0;
5188 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5189 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5190 || uRoundedOff != uRoundingAdd)
5191 {
5192 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5193 {
5194 uFraction += 1;
5195 if (!(uFraction & RT_BIT_64(63)))
5196 { /* likely */ }
5197 else
5198 {
5199 uFraction >>= 1;
5200 pr80Dst->s.uExponent++;
5201 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5202 return fFsw;
5203 }
5204 fFsw |= X86_FSW_C1;
5205 }
5206 }
5207 fFsw |= X86_FSW_PE;
5208 if (!(fFcw & X86_FCW_PM))
5209 fFsw |= X86_FSW_ES | X86_FSW_B;
5210 }
5211
5212 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5213 }
5214 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5215 {
5216 pr80Dst->s.fSign = Tmp.s64.fSign;
5217 pr80Dst->s.uExponent = 0;
5218 pr80Dst->s.uMantissa = 0;
5219 }
5220 else if (RTFLOAT128U_IS_INF(&Tmp))
5221 {
5222 pr80Dst->s.fSign = Tmp.s64.fSign;
5223 pr80Dst->s.uExponent = 0;
5224 pr80Dst->s.uMantissa = 0;
5225 }
5226 return fFsw;
5227}
5228
5229
5230/**
5231 * Helper for transfering exception and C1 to FSW and setting the result value
5232 * accordingly.
5233 *
5234 * @returns Updated FSW.
5235 * @param pSoftState The SoftFloat state following the operation.
5236 * @param r80XResult The result of the SoftFloat operation.
5237 * @param pr80Result Where to store the result for IEM.
5238 * @param fFcw The FPU control word.
5239 * @param fFsw The FSW before the operation, with necessary bits
5240 * cleared and such.
5241 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5242 * raised.
5243 */
5244DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5245 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5246 PCRTFLOAT80U pr80XcptResult)
5247{
5248 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5249 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5250 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5251 fFsw |= X86_FSW_ES | X86_FSW_B;
5252
5253 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5254 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5255 else
5256 {
5257 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5258 *pr80Result = *pr80XcptResult;
5259 }
5260 return fFsw;
5261}
5262
5263
5264/**
5265 * Helper doing polynomial evaluation using Horner's method.
5266 *
5267 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5268 */
5269float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5270 unsigned cPrecision, softfloat_state_t *pSoftState)
5271{
5272 Assert(cHornerConsts > 1);
5273 size_t i = cHornerConsts - 1;
5274 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5275 while (i-- > 0)
5276 {
5277 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5278 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5279 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5280 }
5281 return r128Result;
5282}
5283
5284#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5285
5286
5287/**
5288 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5289 * mantissa, exponent and sign.
5290 *
5291 * @returns Updated FSW.
5292 * @param pr80Dst Where to return the composed value.
5293 * @param fSign The sign.
5294 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5295 * ignored and should be zero. This will probably be
5296 * modified during normalization and rounding.
5297 * @param iExponent Unbiased exponent.
5298 * @param fFcw The FPU control word.
5299 * @param fFsw The FPU status word.
5300 */
5301static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5302 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5303{
5304 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5305
5306 iExponent += RTFLOAT80U_EXP_BIAS;
5307
5308 /* Do normalization if necessary and possible. */
5309 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5310 {
5311 int cShift = 192 - RTUInt256BitCount(puMantissa);
5312 if (iExponent > cShift)
5313 iExponent -= cShift;
5314 else
5315 {
5316 if (fFcw & X86_FCW_UM)
5317 {
5318 if (iExponent > 0)
5319 cShift = --iExponent;
5320 else
5321 cShift = 0;
5322 }
5323 iExponent -= cShift;
5324 }
5325 RTUInt256AssignShiftLeft(puMantissa, cShift);
5326 }
5327
5328 /* Do rounding. */
5329 uint64_t uMantissa = puMantissa->QWords.qw2;
5330 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5331 {
5332 bool fAdd;
5333 switch (fFcw & X86_FCW_RC_MASK)
5334 {
5335 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5336 case X86_FCW_RC_NEAREST:
5337 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5338 {
5339 if ( (uMantissa & 1)
5340 || puMantissa->QWords.qw0 != 0
5341 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5342 {
5343 fAdd = true;
5344 break;
5345 }
5346 uMantissa &= ~(uint64_t)1;
5347 }
5348 fAdd = false;
5349 break;
5350 case X86_FCW_RC_ZERO:
5351 fAdd = false;
5352 break;
5353 case X86_FCW_RC_UP:
5354 fAdd = !fSign;
5355 break;
5356 case X86_FCW_RC_DOWN:
5357 fAdd = fSign;
5358 break;
5359 }
5360 if (fAdd)
5361 {
5362 uint64_t const uTmp = uMantissa;
5363 uMantissa = uTmp + 1;
5364 if (uMantissa < uTmp)
5365 {
5366 uMantissa >>= 1;
5367 uMantissa |= RT_BIT_64(63);
5368 iExponent++;
5369 }
5370 fFsw |= X86_FSW_C1;
5371 }
5372 fFsw |= X86_FSW_PE;
5373 if (!(fFcw & X86_FCW_PM))
5374 fFsw |= X86_FSW_ES | X86_FSW_B;
5375 }
5376
5377 /* Check for underflow (denormals). */
5378 if (iExponent <= 0)
5379 {
5380 if (fFcw & X86_FCW_UM)
5381 {
5382 if (uMantissa & RT_BIT_64(63))
5383 uMantissa >>= 1;
5384 iExponent = 0;
5385 }
5386 else
5387 {
5388 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5389 fFsw |= X86_FSW_ES | X86_FSW_B;
5390 }
5391 fFsw |= X86_FSW_UE;
5392 }
5393 /* Check for overflow */
5394 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5395 {
5396 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5397 }
5398
5399 /* Compose the result. */
5400 pr80Dst->s.uMantissa = uMantissa;
5401 pr80Dst->s.uExponent = iExponent;
5402 pr80Dst->s.fSign = fSign;
5403 return fFsw;
5404}
5405
5406
5407/**
5408 * See also iemAImpl_fld_r80_from_r32
5409 */
5410static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5411{
5412 uint16_t fFsw = 0;
5413 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5414 {
5415 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5416 pr80Dst->sj64.fInteger = 1;
5417 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5418 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5419 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5420 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5421 }
5422 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5423 {
5424 pr80Dst->s.fSign = pr32Val->s.fSign;
5425 pr80Dst->s.uExponent = 0;
5426 pr80Dst->s.uMantissa = 0;
5427 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5428 }
5429 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5430 {
5431 /* Subnormal -> normalized + X86_FSW_DE return. */
5432 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5433 pr80Dst->sj64.fInteger = 1;
5434 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5435 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5436 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5437 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5438 fFsw = X86_FSW_DE;
5439 }
5440 else if (RTFLOAT32U_IS_INF(pr32Val))
5441 {
5442 pr80Dst->s.fSign = pr32Val->s.fSign;
5443 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5444 pr80Dst->s.uMantissa = RT_BIT_64(63);
5445 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5446 }
5447 else
5448 {
5449 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5450 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5451 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5452 pr80Dst->sj64.fInteger = 1;
5453 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5454 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5455 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5456 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5457 }
5458 return fFsw;
5459}
5460
5461
5462/**
5463 * See also iemAImpl_fld_r80_from_r64
5464 */
5465static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5466{
5467 uint16_t fFsw = 0;
5468 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5469 {
5470 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5471 pr80Dst->sj64.fInteger = 1;
5472 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5473 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5474 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5475 }
5476 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5477 {
5478 pr80Dst->s.fSign = pr64Val->s.fSign;
5479 pr80Dst->s.uExponent = 0;
5480 pr80Dst->s.uMantissa = 0;
5481 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5482 }
5483 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5484 {
5485 /* Subnormal values gets normalized. */
5486 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5487 pr80Dst->sj64.fInteger = 1;
5488 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5489 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5490 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5491 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5492 fFsw = X86_FSW_DE;
5493 }
5494 else if (RTFLOAT64U_IS_INF(pr64Val))
5495 {
5496 pr80Dst->s.fSign = pr64Val->s.fSign;
5497 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5498 pr80Dst->s.uMantissa = RT_BIT_64(63);
5499 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5500 }
5501 else
5502 {
5503 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5504 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5505 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5506 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5507 pr80Dst->sj64.fInteger = 1;
5508 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5509 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5510 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5511 }
5512 return fFsw;
5513}
5514
5515
5516/**
5517 * See also EMIT_FILD.
5518 */
5519#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5520static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5521{ \
5522 if (iVal == 0) \
5523 { \
5524 pr80Dst->s.fSign = 0; \
5525 pr80Dst->s.uExponent = 0; \
5526 pr80Dst->s.uMantissa = 0; \
5527 } \
5528 else \
5529 { \
5530 if (iVal > 0) \
5531 pr80Dst->s.fSign = 0; \
5532 else \
5533 { \
5534 pr80Dst->s.fSign = 1; \
5535 iVal = -iVal; \
5536 } \
5537 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5538 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5539 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5540 } \
5541 return pr80Dst; \
5542}
5543EMIT_CONVERT_IXX_TO_R80(16)
5544EMIT_CONVERT_IXX_TO_R80(32)
5545//EMIT_CONVERT_IXX_TO_R80(64)
5546
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The 64-bit operand is converted to 80-bit first.  If the conversion reports
 * a denormal operand (X86_FSW_DE), that flag is dropped again when the first
 * operand is a 387-invalid value or a NaN, or when @a a_DenormalException
 * evaluates to true.  Otherwise, if the denormal exception is unmasked, the
 * first operand is returned together with DE, ES and B without performing the
 * operation.  In all other cases @a a_fnR80ByR80 does the work and its FSW is
 * returned with TOP forced to 7 and the DE flag (if any) merged in.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFswDe = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFswDe || fFswDe == X86_FSW_DE); \
    if (fFswDe != 0) \
    { \
        bool const fDeSuppressed =    RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                   || RTFLOAT80U_IS_NAN(pr80Val1) \
                                   || (a_DenormalException); \
        if (fDeSuppressed) \
            fFswDe = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    uint16_t const fFswNoTop = (uint16_t)(pFpuRes->FSW & ~X86_FSW_TOP_MASK); \
    pFpuRes->FSW = fFswNoTop | (7 << X86_FSW_TOP_SHIFT) | fFswDe; \
}
5569
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The 32-bit operand is converted to 80-bit first.  If the conversion reports
 * a denormal operand (X86_FSW_DE), that flag is dropped again when the first
 * operand is a 387-invalid value or a NaN, or when @a a_DenormalException
 * evaluates to true.  Otherwise, if the denormal exception is unmasked, the
 * first operand is returned together with DE, ES and B without performing the
 * operation.  In all other cases @a a_fnR80ByR80 does the work and its FSW is
 * returned with TOP forced to 7 and the DE flag (if any) merged in.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFswDe = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFswDe || fFswDe == X86_FSW_DE); \
    if (fFswDe != 0) \
    { \
        bool const fDeSuppressed =    RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                   || RTFLOAT80U_IS_NAN(pr80Val1) \
                                   || (a_DenormalException); \
        if (fDeSuppressed) \
            fFswDe = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    uint16_t const fFswNoTop = (uint16_t)(pFpuRes->FSW & ~X86_FSW_TOP_MASK); \
    pFpuRes->FSW = fFswNoTop | (7 << X86_FSW_TOP_SHIFT) | fFswDe; \
}
5592
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * Converts the 32-bit integer operand to 80-bit, lets @a a_fnR80ByR80 do the
 * operation and then forces TOP to 7 in the returned FSW.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    uint16_t const fFswNoTop = (uint16_t)(pFpuRes->FSW & ~X86_FSW_TOP_MASK); \
    pFpuRes->FSW = fFswNoTop | (7 << X86_FSW_TOP_SHIFT); \
}
5601
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * Converts the 16-bit integer operand to 80-bit, lets @a a_fnR80ByR80 do the
 * operation and then forces TOP to 7 in the returned FSW.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    uint16_t const fFswNoTop = (uint16_t)(pFpuRes->FSW & ~X86_FSW_TOP_MASK); \
    pFpuRes->FSW = fFswNoTop | (7 << X86_FSW_TOP_SHIFT); \
}
5610
5611
5612
5613/*********************************************************************************************************************************
5614* x87 FPU Division Operations *
5615*********************************************************************************************************************************/
5616
5617/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5618static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5619 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5620{
5621 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5622 {
5623 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5624 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5625 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5626 }
5627 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5628 { /* Div by zero. */
5629 if (fFcw & X86_FCW_ZM)
5630 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5631 else
5632 {
5633 *pr80Result = *pr80Val1Org;
5634 fFsw |= X86_FSW_ES | X86_FSW_B;
5635 }
5636 fFsw |= X86_FSW_ZE;
5637 }
5638 else
5639 { /* Invalid operand */
5640 if (fFcw & X86_FCW_IM)
5641 *pr80Result = g_r80Indefinite;
5642 else
5643 {
5644 *pr80Result = *pr80Val1Org;
5645 fFsw |= X86_FSW_ES | X86_FSW_B;
5646 }
5647 fFsw |= X86_FSW_IE;
5648 }
5649 return fFsw;
5650}
5651
5652
5653IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5654 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5655{
5656 uint16_t const fFcw = pFpuState->FCW;
5657 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5658
5659 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5660 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5661 {
5662 if (fFcw & X86_FCW_IM)
5663 pFpuRes->r80Result = g_r80Indefinite;
5664 else
5665 {
5666 pFpuRes->r80Result = *pr80Val1;
5667 fFsw |= X86_FSW_ES | X86_FSW_B;
5668 }
5669 fFsw |= X86_FSW_IE;
5670 }
5671 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5672 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5673 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5674 {
5675 if (fFcw & X86_FCW_DM)
5676 {
5677 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5678 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5679 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5680 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5681 }
5682 else
5683 {
5684 pFpuRes->r80Result = *pr80Val1;
5685 fFsw |= X86_FSW_ES | X86_FSW_B;
5686 }
5687 fFsw |= X86_FSW_DE;
5688 }
5689 /* SoftFloat can handle the rest: */
5690 else
5691 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5692
5693 pFpuRes->FSW = fFsw;
5694}
5695
5696
5697EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5698EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5699EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5700EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5701
5702
5703IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5704 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5705{
5706 uint16_t const fFcw = pFpuState->FCW;
5707 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5708
5709 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5710 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5711 {
5712 if (fFcw & X86_FCW_IM)
5713 pFpuRes->r80Result = g_r80Indefinite;
5714 else
5715 {
5716 pFpuRes->r80Result = *pr80Val1;
5717 fFsw |= X86_FSW_ES | X86_FSW_B;
5718 }
5719 fFsw |= X86_FSW_IE;
5720 }
5721 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5722 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5723 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5724 {
5725 if (fFcw & X86_FCW_DM)
5726 {
5727 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5728 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5729 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5730 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5731 }
5732 else
5733 {
5734 pFpuRes->r80Result = *pr80Val1;
5735 fFsw |= X86_FSW_ES | X86_FSW_B;
5736 }
5737 fFsw |= X86_FSW_DE;
5738 }
5739 /* SoftFloat can handle the rest: */
5740 else
5741 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5742
5743 pFpuRes->FSW = fFsw;
5744}
5745
5746
5747EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5748EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5749EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5750EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5751
5752
5753/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5754static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5755 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5756{
5757 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5758 {
5759 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5760 uint16_t fCxFlags = 0;
5761 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5762 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5763 &fCxFlags, &SoftState);
5764 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5765 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5766 if ( !(fFsw & X86_FSW_IE)
5767 && !RTFLOAT80U_IS_NAN(pr80Result)
5768 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5769 {
5770 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5771 fFsw |= fCxFlags & X86_FSW_C_MASK;
5772 }
5773 return fFsw;
5774 }
5775
5776 /* Invalid operand */
5777 if (fFcw & X86_FCW_IM)
5778 *pr80Result = g_r80Indefinite;
5779 else
5780 {
5781 *pr80Result = *pr80Val1Org;
5782 fFsw |= X86_FSW_ES | X86_FSW_B;
5783 }
5784 return fFsw | X86_FSW_IE;
5785}
5786
5787
5788static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5789 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5790{
5791 uint16_t const fFcw = pFpuState->FCW;
5792 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5793
5794 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5795 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5796 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5797 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5798 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5799 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5800 {
5801 if (fFcw & X86_FCW_IM)
5802 pFpuRes->r80Result = g_r80Indefinite;
5803 else
5804 {
5805 pFpuRes->r80Result = *pr80Val1;
5806 fFsw |= X86_FSW_ES | X86_FSW_B;
5807 }
5808 fFsw |= X86_FSW_IE;
5809 }
5810 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5811 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5812 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5813 {
5814 if (fFcw & X86_FCW_DM)
5815 {
5816 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5817 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5818 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5819 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5820 pr80Val1Org, fLegacyInstr);
5821 }
5822 else
5823 {
5824 pFpuRes->r80Result = *pr80Val1;
5825 fFsw |= X86_FSW_ES | X86_FSW_B;
5826 }
5827 fFsw |= X86_FSW_DE;
5828 }
5829 /* SoftFloat can handle the rest: */
5830 else
5831 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5832 pr80Val1, fLegacyInstr);
5833
5834 pFpuRes->FSW = fFsw;
5835}
5836
5837
5838IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5839 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5840{
5841 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5842}
5843
5844
5845IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5846 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5847{
5848 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5849}
5850
5851
5852/*********************************************************************************************************************************
5853* x87 FPU Multiplication Operations *
5854*********************************************************************************************************************************/
5855
5856/** Worker for iemAImpl_fmul_r80_by_r80. */
5857static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5858 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5859{
5860 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5861 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5862 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5863}
5864
5865
5866IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5867 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5868{
5869 uint16_t const fFcw = pFpuState->FCW;
5870 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5871
5872 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5873 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5874 {
5875 if (fFcw & X86_FCW_IM)
5876 pFpuRes->r80Result = g_r80Indefinite;
5877 else
5878 {
5879 pFpuRes->r80Result = *pr80Val1;
5880 fFsw |= X86_FSW_ES | X86_FSW_B;
5881 }
5882 fFsw |= X86_FSW_IE;
5883 }
5884 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5885 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5886 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5887 {
5888 if (fFcw & X86_FCW_DM)
5889 {
5890 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5891 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5892 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5893 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5894 }
5895 else
5896 {
5897 pFpuRes->r80Result = *pr80Val1;
5898 fFsw |= X86_FSW_ES | X86_FSW_B;
5899 }
5900 fFsw |= X86_FSW_DE;
5901 }
5902 /* SoftFloat can handle the rest: */
5903 else
5904 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5905
5906 pFpuRes->FSW = fFsw;
5907}
5908
5909
5910EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5911EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5912EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5913EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5914
5915
5916/*********************************************************************************************************************************
5917* x87 FPU Addition *
5918*********************************************************************************************************************************/
5919
5920/** Worker for iemAImpl_fadd_r80_by_r80. */
5921static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5922 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5923{
5924 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5925 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5926 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5927}
5928
5929
5930IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5931 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5932{
5933 uint16_t const fFcw = pFpuState->FCW;
5934 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5935
5936 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5937 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5938 {
5939 if (fFcw & X86_FCW_IM)
5940 pFpuRes->r80Result = g_r80Indefinite;
5941 else
5942 {
5943 pFpuRes->r80Result = *pr80Val1;
5944 fFsw |= X86_FSW_ES | X86_FSW_B;
5945 }
5946 fFsw |= X86_FSW_IE;
5947 }
5948 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5949 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5950 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5951 {
5952 if (fFcw & X86_FCW_DM)
5953 {
5954 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5955 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5956 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5957 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5958 }
5959 else
5960 {
5961 pFpuRes->r80Result = *pr80Val1;
5962 fFsw |= X86_FSW_ES | X86_FSW_B;
5963 }
5964 fFsw |= X86_FSW_DE;
5965 }
5966 /* SoftFloat can handle the rest: */
5967 else
5968 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5969
5970 pFpuRes->FSW = fFsw;
5971}
5972
5973
5974EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
5975EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
5976EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
5977EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
5978
5979
5980/*********************************************************************************************************************************
5981* x87 FPU Subtraction *
5982*********************************************************************************************************************************/
5983
5984/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
5985static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5986 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5987{
5988 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5989 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5990 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5991}
5992
5993
5994IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5995 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5996{
5997 uint16_t const fFcw = pFpuState->FCW;
5998 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5999
6000 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6001 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6002 {
6003 if (fFcw & X86_FCW_IM)
6004 pFpuRes->r80Result = g_r80Indefinite;
6005 else
6006 {
6007 pFpuRes->r80Result = *pr80Val1;
6008 fFsw |= X86_FSW_ES | X86_FSW_B;
6009 }
6010 fFsw |= X86_FSW_IE;
6011 }
6012 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6013 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6014 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6015 {
6016 if (fFcw & X86_FCW_DM)
6017 {
6018 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6019 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6020 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6021 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6022 }
6023 else
6024 {
6025 pFpuRes->r80Result = *pr80Val1;
6026 fFsw |= X86_FSW_ES | X86_FSW_B;
6027 }
6028 fFsw |= X86_FSW_DE;
6029 }
6030 /* SoftFloat can handle the rest: */
6031 else
6032 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6033
6034 pFpuRes->FSW = fFsw;
6035}
6036
6037
6038EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6039EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6040EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6041EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6042
6043
6044/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6045IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6046 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6047{
6048 uint16_t const fFcw = pFpuState->FCW;
6049 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6050
6051 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6052 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6053 {
6054 if (fFcw & X86_FCW_IM)
6055 pFpuRes->r80Result = g_r80Indefinite;
6056 else
6057 {
6058 pFpuRes->r80Result = *pr80Val1;
6059 fFsw |= X86_FSW_ES | X86_FSW_B;
6060 }
6061 fFsw |= X86_FSW_IE;
6062 }
6063 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6064 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6065 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6066 {
6067 if (fFcw & X86_FCW_DM)
6068 {
6069 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6070 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6071 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6072 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6073 }
6074 else
6075 {
6076 pFpuRes->r80Result = *pr80Val1;
6077 fFsw |= X86_FSW_ES | X86_FSW_B;
6078 }
6079 fFsw |= X86_FSW_DE;
6080 }
6081 /* SoftFloat can handle the rest: */
6082 else
6083 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6084
6085 pFpuRes->FSW = fFsw;
6086}
6087
6088
6089EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6090EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6091EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6092EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6093
6094
6095/*********************************************************************************************************************************
*   x87 FPU Trigonometric Operations                                                                                              *
6097*********************************************************************************************************************************/
6098
6099
/**
 * Placeholder for fpatan in the portable C variant - not implemented.
 *
 * No computation is performed: all four arguments are marked as unused and
 * AssertReleaseFailed() is invoked.  @a pFpuRes is never written.
 *
 * @param   pFpuState   Unused.
 * @param   pFpuRes     Unused.
 * @param   pr80Val1    Unused.
 * @param   pr80Val2    Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                    PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
    AssertReleaseFailed();
}
6106
6107#endif /* IEM_WITHOUT_ASSEMBLY */
6108
/**
 * Intel flavour of iemAImpl_fpatan_r80_by_r80.
 *
 * Nothing vendor specific is done here; all four arguments are forwarded
 * unchanged to iemAImpl_fpatan_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                          PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6114
/**
 * AMD flavour of iemAImpl_fpatan_r80_by_r80.
 *
 * Nothing vendor specific is done here; all four arguments are forwarded
 * unchanged to iemAImpl_fpatan_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6120
6121
6122#if defined(IEM_WITHOUT_ASSEMBLY)
/**
 * Placeholder for fptan in the portable C variant - not implemented.
 *
 * No computation is performed: the arguments are marked as unused and
 * AssertReleaseFailed() is invoked.  @a pFpuResTwo is never written.
 *
 * @param   pFpuState   Unused.
 * @param   pFpuResTwo  Unused.
 * @param   pr80Val     Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
    AssertReleaseFailed();
}
6128#endif /* IEM_WITHOUT_ASSEMBLY */
6129
/**
 * AMD flavour of iemAImpl_fptan_r80_r80.
 *
 * Nothing vendor specific is done here; the arguments are forwarded unchanged
 * to iemAImpl_fptan_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6134
/**
 * Intel flavour of iemAImpl_fptan_r80_r80.
 *
 * Nothing vendor specific is done here; the arguments are forwarded unchanged
 * to iemAImpl_fptan_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6139
6140
6141#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Placeholder for fsin in the portable C variant - not implemented.
 *
 * No computation is performed: the arguments are marked as unused and
 * AssertReleaseFailed() is invoked.  @a pFpuRes is never written.
 *
 * @param   pFpuState   Unused.
 * @param   pFpuRes     Unused.
 * @param   pr80Val     Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    RT_NOREF(pFpuState, pFpuRes, pr80Val);
    AssertReleaseFailed();
}
6147#endif /* IEM_WITHOUT_ASSEMBLY */
6148
/**
 * AMD flavour of iemAImpl_fsin_r80.
 *
 * Nothing vendor specific is done here; the arguments are forwarded unchanged
 * to iemAImpl_fsin_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6153
/**
 * Intel flavour of iemAImpl_fsin_r80.
 *
 * Nothing vendor specific is done here; the arguments are forwarded unchanged
 * to iemAImpl_fsin_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6158
6159#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Placeholder for fsincos in the portable C variant - not implemented.
 *
 * No computation is performed: the arguments are marked as unused and
 * AssertReleaseFailed() is invoked.  @a pFpuResTwo is never written.
 *
 * @param   pFpuState   Unused.
 * @param   pFpuResTwo  Unused.
 * @param   pr80Val     Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
    AssertReleaseFailed();
}
6165#endif /* IEM_WITHOUT_ASSEMBLY */
6166
/**
 * AMD flavour of iemAImpl_fsincos_r80_r80.
 *
 * Nothing vendor specific is done here; the arguments are forwarded unchanged
 * to iemAImpl_fsincos_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6171
/**
 * Intel flavour of iemAImpl_fsincos_r80_r80.
 *
 * Nothing vendor specific is done here; the arguments are forwarded unchanged
 * to iemAImpl_fsincos_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6176
6177
6178#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Placeholder for fcos in the portable C variant - not implemented.
 *
 * No computation is performed: the arguments are marked as unused and
 * AssertReleaseFailed() is invoked.  @a pFpuRes is never written.
 *
 * @param   pFpuState   Unused.
 * @param   pFpuRes     Unused.
 * @param   pr80Val     Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    RT_NOREF(pFpuState, pFpuRes, pr80Val);
    AssertReleaseFailed();
}
6184#endif /* IEM_WITHOUT_ASSEMBLY */
6185
/**
 * AMD flavour of iemAImpl_fcos_r80.
 *
 * Nothing vendor specific is done here; the arguments are forwarded unchanged
 * to iemAImpl_fcos_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6190
/**
 * Intel flavour of iemAImpl_fcos_r80.
 *
 * Nothing vendor specific is done here; the arguments are forwarded unchanged
 * to iemAImpl_fcos_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6195
6196#ifdef IEM_WITHOUT_ASSEMBLY
6197
6198
6199/*********************************************************************************************************************************
6200* x87 FPU Compare and Testing Operations *
6201*********************************************************************************************************************************/
6202
6203IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6204{
6205 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6206
6207 if (RTFLOAT80U_IS_ZERO(pr80Val))
6208 fFsw |= X86_FSW_C3;
6209 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6210 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6211 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6212 {
6213 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6214 if (!(pFpuState->FCW & X86_FCW_DM))
6215 fFsw |= X86_FSW_ES | X86_FSW_B;
6216 }
6217 else
6218 {
6219 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6220 if (!(pFpuState->FCW & X86_FCW_IM))
6221 fFsw |= X86_FSW_ES | X86_FSW_B;
6222 }
6223
6224 *pu16Fsw = fFsw;
6225}
6226
6227
6228IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6229{
6230 RT_NOREF(pFpuState);
6231 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6232
6233 /* C1 = sign bit (always, even if empty Intel says). */
6234 if (pr80Val->s.fSign)
6235 fFsw |= X86_FSW_C1;
6236
6237 /* Classify the value in C0, C2, C3. */
6238 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6239 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6240 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6241 fFsw |= X86_FSW_C2;
6242 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6243 fFsw |= X86_FSW_C3;
6244 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6245 fFsw |= X86_FSW_C0;
6246 else if (RTFLOAT80U_IS_INF(pr80Val))
6247 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6248 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6249 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6250 /* whatever else: 0 */
6251
6252 *pu16Fsw = fFsw;
6253}
6254
6255
6256/**
6257 * Worker for fcom, fucom, and friends.
6258 */
6259static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6260 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6261{
6262 /*
6263 * Unpack the values.
6264 */
6265 bool const fSign1 = pr80Val1->s.fSign;
6266 int32_t iExponent1 = pr80Val1->s.uExponent;
6267 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6268
6269 bool const fSign2 = pr80Val2->s.fSign;
6270 int32_t iExponent2 = pr80Val2->s.uExponent;
6271 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6272
6273 /*
6274 * Check for invalid inputs.
6275 */
6276 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6277 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6278 {
6279 if (!(fFcw & X86_FCW_IM))
6280 fFsw |= X86_FSW_ES | X86_FSW_B;
6281 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6282 }
6283
6284 /*
6285 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6286 */
6287 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6288 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6289 {
6290 if ( fIeOnAllNaNs
6291 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6292 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6293 {
6294 fFsw |= X86_FSW_IE;
6295 if (!(fFcw & X86_FCW_IM))
6296 fFsw |= X86_FSW_ES | X86_FSW_B;
6297 }
6298 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6299 }
6300
6301 /*
6302 * Normalize the values.
6303 */
6304 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6305 {
6306 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6307 iExponent1 = 1;
6308 else
6309 {
6310 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6311 uMantissa1 <<= iExponent1;
6312 iExponent1 = 1 - iExponent1;
6313 }
6314 fFsw |= X86_FSW_DE;
6315 if (!(fFcw & X86_FCW_DM))
6316 fFsw |= X86_FSW_ES | X86_FSW_B;
6317 }
6318
6319 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6320 {
6321 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6322 iExponent2 = 1;
6323 else
6324 {
6325 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6326 uMantissa2 <<= iExponent2;
6327 iExponent2 = 1 - iExponent2;
6328 }
6329 fFsw |= X86_FSW_DE;
6330 if (!(fFcw & X86_FCW_DM))
6331 fFsw |= X86_FSW_ES | X86_FSW_B;
6332 }
6333
6334 /*
6335 * Test if equal (val1 == val2):
6336 */
6337 if ( uMantissa1 == uMantissa2
6338 && iExponent1 == iExponent2
6339 && ( fSign1 == fSign2
6340 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6341 fFsw |= X86_FSW_C3;
6342 /*
6343 * Test if less than (val1 < val2):
6344 */
6345 else if (fSign1 && !fSign2)
6346 fFsw |= X86_FSW_C0;
6347 else if (fSign1 == fSign2)
6348 {
6349 /* Zeros are problematic, however at the most one can be zero here. */
6350 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6351 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6352 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6353 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6354
6355 if ( fSign1
6356 ^ ( iExponent1 < iExponent2
6357 || ( iExponent1 == iExponent2
6358 && uMantissa1 < uMantissa2 ) ) )
6359 fFsw |= X86_FSW_C0;
6360 }
6361 /* else: No flags set if greater. */
6362
6363 return fFsw;
6364}
6365
6366
6367IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6368 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6369{
6370 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6371}
6372
6373
6374
6375
6376IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6377 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6378{
6379 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6380}
6381
6382
6383IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6384 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6385{
6386 RTFLOAT80U r80Val2;
6387 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6388 Assert(!fFsw || fFsw == X86_FSW_DE);
6389 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6390 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6391 {
6392 if (!(pFpuState->FCW & X86_FCW_DM))
6393 fFsw |= X86_FSW_ES | X86_FSW_B;
6394 *pfFsw |= fFsw;
6395 }
6396}
6397
6398
6399IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6400 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6401{
6402 RTFLOAT80U r80Val2;
6403 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6404 Assert(!fFsw || fFsw == X86_FSW_DE);
6405 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6406 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6407 {
6408 if (!(pFpuState->FCW & X86_FCW_DM))
6409 fFsw |= X86_FSW_ES | X86_FSW_B;
6410 *pfFsw |= fFsw;
6411 }
6412}
6413
6414
6415IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6416 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6417{
6418 RTFLOAT80U r80Val2;
6419 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6420 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6421}
6422
6423
6424IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6425 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6426{
6427 RTFLOAT80U r80Val2;
6428 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6429 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6430}
6431
6432
6433/**
6434 * Worker for fcomi & fucomi.
6435 */
6436static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6437 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6438{
6439 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6440 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6441 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6442 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6443
6444 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6445 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6446 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6447}
6448
6449
6450IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6451 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6452{
6453 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
6454}
6455
6456
6457IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6458 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6459{
6460 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
6461}
6462
6463
6464/*********************************************************************************************************************************
6465* x87 FPU Other Operations *
6466*********************************************************************************************************************************/
6467
6468/**
6469 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6470 */
6471static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6472{
6473 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6474 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
6475 true /*exact / generate #PE */, &SoftState));
6476 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6477}
6478
6479
6480IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6481{
6482 uint16_t const fFcw = pFpuState->FCW;
6483 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6484
6485 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6486 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6487 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6488 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6489 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6490 || RTFLOAT80U_IS_INF(pr80Val))
6491 pFpuRes->r80Result = *pr80Val;
6492 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6493 {
6494 fFsw |= X86_FSW_DE;
6495 if (fFcw & X86_FCW_DM)
6496 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6497 else
6498 {
6499 pFpuRes->r80Result = *pr80Val;
6500 fFsw |= X86_FSW_ES | X86_FSW_B;
6501 }
6502 }
6503 else
6504 {
6505 if (fFcw & X86_FCW_IM)
6506 {
6507 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6508 pFpuRes->r80Result = g_r80Indefinite;
6509 else
6510 {
6511 pFpuRes->r80Result = *pr80Val;
6512 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6513 }
6514 }
6515 else
6516 {
6517 pFpuRes->r80Result = *pr80Val;
6518 fFsw |= X86_FSW_ES | X86_FSW_B;
6519 }
6520 fFsw |= X86_FSW_IE;
6521 }
6522 pFpuRes->FSW = fFsw;
6523}
6524
6525
6526IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6527 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6528{
6529 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
6530 it does everything we need it to do. */
6531 uint16_t const fFcw = pFpuState->FCW;
6532 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6533 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6534 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6535 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6536}
6537
6538
6539/**
6540 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
6541 */
6542static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6543{
6544 Assert(!pr80Val->s.fSign);
6545 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6546 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
6547 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6548}
6549
6550
6551IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6552{
6553 uint16_t const fFcw = pFpuState->FCW;
6554 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6555
6556 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
6557 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6558 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6559 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6560 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6561 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
6562 pFpuRes->r80Result = *pr80Val;
6563 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
6564 {
6565 fFsw |= X86_FSW_DE;
6566 if (fFcw & X86_FCW_DM)
6567 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6568 else
6569 {
6570 pFpuRes->r80Result = *pr80Val;
6571 fFsw |= X86_FSW_ES | X86_FSW_B;
6572 }
6573 }
6574 else
6575 {
6576 if (fFcw & X86_FCW_IM)
6577 {
6578 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6579 pFpuRes->r80Result = g_r80Indefinite;
6580 else
6581 {
6582 pFpuRes->r80Result = *pr80Val;
6583 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6584 }
6585 }
6586 else
6587 {
6588 pFpuRes->r80Result = *pr80Val;
6589 fFsw |= X86_FSW_ES | X86_FSW_B;
6590 }
6591 fFsw |= X86_FSW_IE;
6592 }
6593 pFpuRes->FSW = fFsw;
6594}
6595
6596
6597/**
6598 * @code{.unparsed}
6599 * x x * ln2
6600 * f(x) = 2 - 1 = e - 1
6601 *
6602 * @endcode
6603 *
6604 * We can approximate e^x by a Taylor/Maclaurin series (see
6605 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
6606 * @code{.unparsed}
6607 * n 0 1 2 3 4
6608 * inf x x x x x x
6609 * SUM ----- = --- + --- + --- + --- + --- + ...
6610 * n=0 n! 0! 1! 2! 3! 4!
6611 *
6612 * 2 3 4
6613 * x x x
6614 * = 1 + x + --- + --- + --- + ...
6615 * 2! 3! 4!
6616 * @endcode
6617 *
6618 * Given z = x * ln2, we get:
6619 * @code{.unparsed}
6620 * 2 3 4 n
6621 * z z z z z
6622 * e - 1 = z + --- + --- + --- + ... + ---
6623 * 2! 3! 4! n!
6624 * @endcode
6625 *
6626 * Wanting to use Horner's method, we move one z outside and get:
6627 * @code{.unparsed}
6628 * 2 3 (n-1)
6629 * z z z z
6630 * = z ( 1 + --- + --- + --- + ... + ------- )
6631 * 2! 3! 4! n!
6632 * @endcode
6633 *
6634 * The constants we need for using Horner's methods are 1 and 1 / n!.
6635 *
 * For very tiny x values, we can get away with f(x) = x * ln 2, because
 * we don't have the necessary precision to represent 1.0 + z/3 + ...
 * and can approximate it to be 1.0. For a visual demonstration of this
 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
 * as it is valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
6641 *
6642 *
6643 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
6644 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
6645 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
6646 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
6647 * blocks). (The one bit difference is probably an implicit one missing from
6648 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
6649 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
6650 * exponent.
6651 *
 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
 * not yet successfully reproduced the exact results from an Intel 10980XE,
 * there is always a portion of rounding differences. Not going to spend too
 * much time on getting this 100% the same, at least not now.
6656 *
 * P.S. If anyone is really curious about the 8087 and its constants:
6658 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
6659 *
6660 *
6661 * @param pr80Val The exponent value (x), less than 1.0, greater than
6662 * -1.0 and not zero. This can be a normal, denormal
6663 * or pseudo-denormal value.
6664 * @param pr80Result Where to return the result.
6665 * @param fFcw FPU control word.
6666 * @param fFsw FPU status word.
6667 */
static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
{
    /* As mentioned above, we can skip the expensive polynomial calculation
       as it will be close enough to 1.0 that it makes no difference.

       The cutoff point for intel 10980XE is exponents >= -69. Intel
       also seems to be using a 67-bit or 68-bit constant value, and we get
       a smattering of rounding differences if we go for higher precision.

       Note: the check below takes this shortcut when the biased exponent is
       RTFLOAT80U_EXP_BIAS - 69 or smaller; everything above goes thru the
       polynomial in the else branch. */
    if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
    {
        /* Tiny input: result = x * ln2, i.e. multiply the input mantissa by
           the ln2 mantissa constant and round/compose that into pr80Result. */
        RTUINT256U u256;
        RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
        u256.QWords.qw0 |= 1; /* force #PE */
        /* The exponent argument is the unbiased input exponent, except for
           denormals and pseudo-denormals where 1 - RTFLOAT80U_EXP_BIAS is
           used instead. */
        fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
                                                   !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
                                                   ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
                                                   : 1 - RTFLOAT80U_EXP_BIAS,
                                                   fFcw, fFsw);
    }
    else
    {
#ifdef IEM_WITH_FLOAT128_FOR_FPU
        /* Variant using the compiler's _Float128 type: 2^x via powf128, then
           subtract 1, with the rounding set up from fFcw around it. */
        /* This approach is not good enough for small values, we end up with zero. */
        int const fOldRounding = iemFpuF128SetRounding(fFcw);
        _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
        _Float128 rd128Result = powf128(2.0L, rd128Val);
        rd128Result -= 1.0L;
        fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
        iemFpuF128RestoreRounding(fOldRounding);

# else
        /* Variant using SoftFloat 128-bit arithmetic with default state. */
        softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
        float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);

        /* As mentioned above, enforce 68-bit internal mantissa width to better
           match the Intel 10980XE results. */
        unsigned const cPrecision = 68;

        /* first calculate z = x * ln2 */
        float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
                                               cPrecision);

        /* Then do the polynomial evaluation. */
        float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
                                                cPrecision, &SoftState);
        /* Multiply by the z that was moved outside the polynomial. */
        r = f128_mul(z, r, &SoftState);

        /* Output the result. */
        fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
# endif
    }
    return fFsw;
}
6721
6722
6723IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6724{
6725 uint16_t const fFcw = pFpuState->FCW;
6726 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6727
6728 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6729 {
6730 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
6731 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6732 else
6733 {
6734 /* Special case:
6735 2^+1.0 - 1.0 = 1.0
6736 2^-1.0 - 1.0 = -0.5 */
6737 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
6738 && pr80Val->s.uMantissa == RT_BIT_64(63))
6739 {
6740 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
6741 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
6742 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6743 }
6744 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
6745 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
6746 else
6747 pFpuRes->r80Result = *pr80Val;
6748 fFsw |= X86_FSW_PE;
6749 if (!(fFcw & X86_FCW_PM))
6750 fFsw |= X86_FSW_ES | X86_FSW_B;
6751 }
6752 }
6753 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6754 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6755 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6756 pFpuRes->r80Result = *pr80Val;
6757 else if (RTFLOAT80U_IS_INF(pr80Val))
6758 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
6759 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6760 {
6761 fFsw |= X86_FSW_DE;
6762 if (fFcw & X86_FCW_DM)
6763 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6764 else
6765 {
6766 pFpuRes->r80Result = *pr80Val;
6767 fFsw |= X86_FSW_ES | X86_FSW_B;
6768 }
6769 }
6770 else
6771 {
6772 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6773 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6774 && (fFcw & X86_FCW_IM))
6775 pFpuRes->r80Result = g_r80Indefinite;
6776 else
6777 {
6778 pFpuRes->r80Result = *pr80Val;
6779 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6780 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6781 }
6782 fFsw |= X86_FSW_IE;
6783 if (!(fFcw & X86_FCW_IM))
6784 fFsw |= X86_FSW_ES | X86_FSW_B;
6785 }
6786 pFpuRes->FSW = fFsw;
6787}
6788
6789#endif /* IEM_WITHOUT_ASSEMBLY */
6790
/**
 * AMD flavour of iemAImpl_f2xm1_r80.
 *
 * Nothing vendor specific is done here; the arguments are forwarded unchanged
 * to iemAImpl_f2xm1_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
6795
/**
 * Intel flavour of iemAImpl_f2xm1_r80.
 *
 * Nothing vendor specific is done here; the arguments are forwarded unchanged
 * to iemAImpl_f2xm1_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
6800
6801#ifdef IEM_WITHOUT_ASSEMBLY
6802
6803IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6804{
6805 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6806 pFpuRes->r80Result = *pr80Val;
6807 pFpuRes->r80Result.s.fSign = 0;
6808}
6809
6810
6811IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6812{
6813 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6814 pFpuRes->r80Result = *pr80Val;
6815 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
6816}
6817
6818
6819IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6820{
6821 uint16_t const fFcw = pFpuState->FCW;
6822 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6823
6824 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6825 {
6826 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6827 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
6828
6829 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6830 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6831 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6832 }
6833 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6834 {
6835 fFsw |= X86_FSW_ZE;
6836 if (fFcw & X86_FCW_ZM)
6837 {
6838 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
6839 pFpuResTwo->r80Result2 = *pr80Val;
6840 }
6841 else
6842 {
6843 pFpuResTwo->r80Result2 = *pr80Val;
6844 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6845 }
6846 }
6847 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6848 {
6849 fFsw |= X86_FSW_DE;
6850 if (fFcw & X86_FCW_DM)
6851 {
6852 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6853 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6854 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6855 int32_t iExponent = -16382;
6856 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
6857 {
6858 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
6859 iExponent--;
6860 }
6861
6862 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6863 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
6864 }
6865 else
6866 {
6867 pFpuResTwo->r80Result2 = *pr80Val;
6868 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6869 }
6870 }
6871 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6872 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6873 {
6874 pFpuResTwo->r80Result1 = *pr80Val;
6875 pFpuResTwo->r80Result2 = *pr80Val;
6876 }
6877 else if (RTFLOAT80U_IS_INF(pr80Val))
6878 {
6879 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
6880 pFpuResTwo->r80Result2 = *pr80Val;
6881 }
6882 else
6883 {
6884 if (fFcw & X86_FCW_IM)
6885 {
6886 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6887 pFpuResTwo->r80Result1 = g_r80Indefinite;
6888 else
6889 {
6890 pFpuResTwo->r80Result1 = *pr80Val;
6891 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6892 }
6893 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
6894 }
6895 else
6896 {
6897 pFpuResTwo->r80Result2 = *pr80Val;
6898 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6899 }
6900 fFsw |= X86_FSW_IE;
6901 }
6902 pFpuResTwo->FSW = fFsw;
6903}
6904
6905
/**
 * Placeholder for fyl2x in the portable C variant - not implemented.
 *
 * No computation is performed: all four arguments are marked as unused and
 * AssertReleaseFailed() is invoked.  @a pFpuRes is never written.
 *
 * @param   pFpuState   Unused.
 * @param   pFpuRes     Unused.
 * @param   pr80Val1    Unused.
 * @param   pr80Val2    Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
    AssertReleaseFailed();
}
6912
6913#endif /* IEM_WITHOUT_ASSEMBLY */
6914
/**
 * Intel flavour of iemAImpl_fyl2x_r80_by_r80.
 *
 * Nothing vendor specific is done here; all four arguments are forwarded
 * unchanged to iemAImpl_fyl2x_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6920
/**
 * AMD flavour of iemAImpl_fyl2x_r80_by_r80.
 *
 * Nothing vendor specific is done here; all four arguments are forwarded
 * unchanged to iemAImpl_fyl2x_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6926
6927#if defined(IEM_WITHOUT_ASSEMBLY)
6928
/**
 * Placeholder for fyl2xp1 in the portable C variant - not implemented.
 *
 * No computation is performed: all four arguments are marked as unused and
 * AssertReleaseFailed() is invoked.  @a pFpuRes is never written.
 *
 * @param   pFpuState   Unused.
 * @param   pFpuRes     Unused.
 * @param   pr80Val1    Unused.
 * @param   pr80Val2    Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                     PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
    AssertReleaseFailed();
}
6935
6936#endif /* IEM_WITHOUT_ASSEMBLY */
6937
/**
 * Intel flavour of iemAImpl_fyl2xp1_r80_by_r80.
 *
 * Nothing vendor specific is done here; all four arguments are forwarded
 * unchanged to iemAImpl_fyl2xp1_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                           PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6943
/**
 * AMD flavour of iemAImpl_fyl2xp1_r80_by_r80.
 *
 * Nothing vendor specific is done here; all four arguments are forwarded
 * unchanged to iemAImpl_fyl2xp1_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6949
6950
6951/*********************************************************************************************************************************
6952* MMX, SSE & AVX *
6953*********************************************************************************************************************************/
6954
6955/*
6956 * MOVSLDUP / VMOVSLDUP
6957 */
6958IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
6959{
6960 puDst->au32[0] = puSrc->au32[0];
6961 puDst->au32[1] = puSrc->au32[0];
6962 puDst->au32[2] = puSrc->au32[2];
6963 puDst->au32[3] = puSrc->au32[2];
6964}
6965
6966#ifdef IEM_WITH_VEX
6967
6968IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
6969{
6970 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
6971 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
6972 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
6973 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
6974 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6975 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6976 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6977 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6978}
6979
6980
6981IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
6982{
6983 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
6984 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
6985 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
6986 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
6987 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
6988 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
6989 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
6990 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
6991}
6992
6993#endif /* IEM_WITH_VEX */
6994
6995
6996/*
6997 * MOVSHDUP / VMOVSHDUP
6998 */
6999IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7000{
7001 puDst->au32[0] = puSrc->au32[1];
7002 puDst->au32[1] = puSrc->au32[1];
7003 puDst->au32[2] = puSrc->au32[3];
7004 puDst->au32[3] = puSrc->au32[3];
7005}
7006
7007#ifdef IEM_WITH_VEX
7008
7009IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7010{
7011 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7012 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7013 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7014 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7015 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7016 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7017 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7018 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7019}
7020
7021
7022IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7023{
7024 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7025 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7026 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7027 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7028 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7029 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7030 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7031 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7032}
7033
7034#endif /* IEM_WITH_VEX */
7035
7036
7037/*
7038 * MOVDDUP / VMOVDDUP
7039 */
7040IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PRTUINT128U puDst, uint64_t uSrc))
7041{
7042 puDst->au64[0] = uSrc;
7043 puDst->au64[1] = uSrc;
7044}
7045
7046#ifdef IEM_WITH_VEX
7047
7048IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7049{
7050 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7051 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7052 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7053 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7054}
7055
7056IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7057{
7058 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7059 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7060 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7061 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7062}
7063
7064#endif /* IEM_WITH_VEX */
7065
7066
/*
 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
 */
7070#ifdef IEM_WITHOUT_ASSEMBLY
7071
7072IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7073{
7074 RT_NOREF(pFpuState);
7075 *puDst &= *puSrc;
7076}
7077
7078
7079IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7080{
7081 RT_NOREF(pFpuState);
7082 puDst->au64[0] &= puSrc->au64[0];
7083 puDst->au64[1] &= puSrc->au64[1];
7084}
7085
7086#endif
7087
7088IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7089 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7090{
7091 RT_NOREF(pExtState);
7092 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7093 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7094}
7095
7096
7097IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7098 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7099{
7100 RT_NOREF(pExtState);
7101 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7102 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7103 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7104 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7105}
7106
7107
/*
 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
 */
7111#ifdef IEM_WITHOUT_ASSEMBLY
7112
7113IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7114{
7115 RT_NOREF(pFpuState);
7116 *puDst = ~*puDst & *puSrc;
7117}
7118
7119
7120IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7121{
7122 RT_NOREF(pFpuState);
7123 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7124 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7125}
7126
7127#endif
7128
7129IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7130 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7131{
7132 RT_NOREF(pExtState);
7133 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7134 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7135}
7136
7137
7138IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7139 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7140{
7141 RT_NOREF(pExtState);
7142 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7143 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7144 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7145 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7146}
7147
7148
/*
 * POR / VPOR / ORPS / VORPS / ORPD / VORPD
 */
7152#ifdef IEM_WITHOUT_ASSEMBLY
7153
7154IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7155{
7156 RT_NOREF(pFpuState);
7157 *puDst |= *puSrc;
7158}
7159
7160
7161IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7162{
7163 RT_NOREF(pFpuState);
7164 puDst->au64[0] |= puSrc->au64[0];
7165 puDst->au64[1] |= puSrc->au64[1];
7166}
7167
7168#endif
7169
7170IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7171 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7172{
7173 RT_NOREF(pExtState);
7174 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7175 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7176}
7177
7178
7179IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7180 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7181{
7182 RT_NOREF(pExtState);
7183 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7184 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7185 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7186 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7187}
7188
7189
/*
 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
 */
7193#ifdef IEM_WITHOUT_ASSEMBLY
7194
7195IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7196{
7197 RT_NOREF(pFpuState);
7198 *puDst ^= *puSrc;
7199}
7200
7201
7202IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7203{
7204 RT_NOREF(pFpuState);
7205 puDst->au64[0] ^= puSrc->au64[0];
7206 puDst->au64[1] ^= puSrc->au64[1];
7207}
7208
7209#endif
7210
7211IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7212 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7213{
7214 RT_NOREF(pExtState);
7215 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7216 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7217}
7218
7219
7220IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7221 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7222{
7223 RT_NOREF(pExtState);
7224 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7225 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7226 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7227 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7228}
7229
7230
7231/*
7232 * PCMPEQB / VPCMPEQB
7233 */
7234#ifdef IEM_WITHOUT_ASSEMBLY
7235
7236IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7237{
7238 RT_NOREF(pFpuState);
7239 RTUINT64U uSrc1 = { *puDst };
7240 RTUINT64U uSrc2 = { *puSrc };
7241 RTUINT64U uDst;
7242 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7243 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7244 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7245 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7246 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7247 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7248 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7249 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7250 *puDst = uDst.u;
7251}
7252
7253
7254IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7255{
7256 RT_NOREF(pFpuState);
7257 RTUINT128U uSrc1 = *puDst;
7258 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7259 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7260 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7261 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7262 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7263 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7264 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7265 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7266 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7267 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7268 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7269 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7270 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7271 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7272 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7273 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7274}
7275
7276#endif
7277
7278IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7279 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7280{
7281 RT_NOREF(pExtState);
7282 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7283 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7284 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7285 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7286 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7287 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7288 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7289 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7290 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7291 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7292 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7293 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7294 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7295 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7296 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7297 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7298}
7299
7300IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7301 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7302{
7303 RT_NOREF(pExtState);
7304 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7305 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7306 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7307 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7308 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7309 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7310 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7311 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7312 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7313 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7314 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7315 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7316 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7317 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7318 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7319 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7320 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7321 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7322 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7323 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7324 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7325 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7326 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7327 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7328 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7329 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7330 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7331 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7332 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7333 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7334 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7335 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7336}
7337
7338
7339/*
7340 * PCMPEQW / VPCMPEQW
7341 */
7342#ifdef IEM_WITHOUT_ASSEMBLY
7343
7344IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7345{
7346 RT_NOREF(pFpuState);
7347 RTUINT64U uSrc1 = { *puDst };
7348 RTUINT64U uSrc2 = { *puSrc };
7349 RTUINT64U uDst;
7350 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7351 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7352 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7353 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7354 *puDst = uDst.u;
7355}
7356
7357
7358IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7359{
7360 RT_NOREF(pFpuState);
7361 RTUINT128U uSrc1 = *puDst;
7362 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7363 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7364 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7365 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7366 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7367 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7368 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7369 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7370}
7371
7372#endif
7373
7374IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7375 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7376{
7377 RT_NOREF(pExtState);
7378 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7379 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7380 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7381 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7382 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7383 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7384 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7385 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7386}
7387
7388IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7389 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7390{
7391 RT_NOREF(pExtState);
7392 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7393 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7394 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7395 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7396 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7397 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7398 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7399 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7400 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
7401 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
7402 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
7403 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
7404 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
7405 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
7406 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
7407 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
7408}
7409
7410
7411/*
7412 * PCMPEQD / VPCMPEQD.
7413 */
7414#ifdef IEM_WITHOUT_ASSEMBLY
7415
7416IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7417{
7418 RT_NOREF(pFpuState);
7419 RTUINT64U uSrc1 = { *puDst };
7420 RTUINT64U uSrc2 = { *puSrc };
7421 RTUINT64U uDst;
7422 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
7423 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
7424 *puDst = uDst.u;
7425}
7426
7427
7428IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7429{
7430 RT_NOREF(pFpuState);
7431 RTUINT128U uSrc1 = *puDst;
7432 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
7433 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
7434 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
7435 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
7436}
7437
7438#endif /* IEM_WITHOUT_ASSEMBLY */
7439
7440IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7441 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7442{
7443 RT_NOREF(pExtState);
7444 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7445 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7446 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7447 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7448}
7449
7450IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7451 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7452{
7453 RT_NOREF(pExtState);
7454 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7455 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7456 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7457 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7458 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
7459 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
7460 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
7461 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
7462}
7463
7464
7465/*
7466 * PCMPEQQ / VPCMPEQQ.
7467 */
7468IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7469{
7470 RT_NOREF(pFpuState);
7471 RTUINT128U uSrc1 = *puDst;
7472 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
7473 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
7474}
7475
7476IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7477 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7478{
7479 RT_NOREF(pExtState);
7480 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7481 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7482}
7483
7484IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7485 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7486{
7487 RT_NOREF(pExtState);
7488 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7489 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7490 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
7491 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
7492}
7493
7494
7495/*
7496 * PCMPGTB / VPCMPGTB
7497 */
7498#ifdef IEM_WITHOUT_ASSEMBLY
7499
7500IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7501{
7502 RT_NOREF(pFpuState);
7503 RTUINT64U uSrc1 = { *puDst };
7504 RTUINT64U uSrc2 = { *puSrc };
7505 RTUINT64U uDst;
7506 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
7507 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
7508 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
7509 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
7510 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
7511 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
7512 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
7513 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
7514 *puDst = uDst.u;
7515}
7516
7517
7518IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7519{
7520 RT_NOREF(pFpuState);
7521 RTUINT128U uSrc1 = *puDst;
7522 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
7523 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
7524 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
7525 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
7526 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
7527 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
7528 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
7529 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
7530 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
7531 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
7532 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
7533 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
7534 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
7535 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
7536 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
7537 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
7538}
7539
7540#endif
7541
7542IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7543 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7544{
7545 RT_NOREF(pExtState);
7546 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7547 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7548 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7549 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7550 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7551 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7552 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7553 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7554 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7555 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7556 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7557 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7558 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7559 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7560 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7561 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7562}
7563
7564IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7565 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7566{
7567 RT_NOREF(pExtState);
7568 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7569 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7570 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7571 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7572 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7573 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7574 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7575 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7576 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7577 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7578 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7579 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7580 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7581 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7582 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7583 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7584 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
7585 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
7586 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
7587 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
7588 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
7589 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
7590 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
7591 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
7592 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
7593 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
7594 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
7595 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
7596 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
7597 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
7598 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
7599 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
7600}
7601
7602
7603/*
7604 * PCMPGTW / VPCMPGTW
7605 */
7606#ifdef IEM_WITHOUT_ASSEMBLY
7607
7608IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7609{
7610 RT_NOREF(pFpuState);
7611 RTUINT64U uSrc1 = { *puDst };
7612 RTUINT64U uSrc2 = { *puSrc };
7613 RTUINT64U uDst;
7614 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
7615 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
7616 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
7617 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
7618 *puDst = uDst.u;
7619}
7620
7621
7622IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7623{
7624 RT_NOREF(pFpuState);
7625 RTUINT128U uSrc1 = *puDst;
7626 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
7627 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
7628 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
7629 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
7630 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
7631 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
7632 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
7633 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
7634}
7635
7636#endif
7637
7638IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7639 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7640{
7641 RT_NOREF(pExtState);
7642 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
7643 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
7644 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
7645 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
7646 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
7647 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
7648 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
7649 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
7650}
7651
7652IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7653 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7654{
7655 RT_NOREF(pExtState);
7656 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
7657 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
7658 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
7659 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
7660 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
7661 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
7662 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
7663 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
7664 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
7665 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
7666 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
7667 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
7668 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
7669 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
7670 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
7671 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
7672}
7673
7674
7675/*
7676 * PCMPGTD / VPCMPGTD.
7677 */
7678#ifdef IEM_WITHOUT_ASSEMBLY
7679
7680IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7681{
7682 RT_NOREF(pFpuState);
7683 RTUINT64U uSrc1 = { *puDst };
7684 RTUINT64U uSrc2 = { *puSrc };
7685 RTUINT64U uDst;
7686 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
7687 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
7688 *puDst = uDst.u;
7689}
7690
7691
7692IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7693{
7694 RT_NOREF(pFpuState);
7695 RTUINT128U uSrc1 = *puDst;
7696 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
7697 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
7698 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
7699 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
7700}
7701
7702#endif /* IEM_WITHOUT_ASSEMBLY */
7703
7704IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7705 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7706{
7707 RT_NOREF(pExtState);
7708 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
7709 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
7710 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
7711 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
7712}
7713
7714IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7715 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7716{
7717 RT_NOREF(pExtState);
7718 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
7719 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
7720 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
7721 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
7722 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
7723 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
7724 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
7725 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
7726}
7727
7728
7729/*
7730 * PCMPGTQ / VPCMPGTQ.
7731 */
7732IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7733{
7734 RT_NOREF(pFpuState);
7735 RTUINT128U uSrc1 = *puDst;
7736 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
7737 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
7738}
7739
7740IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7741 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7742{
7743 RT_NOREF(pExtState);
7744 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
7745 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
7746}
7747
7748IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7749 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7750{
7751 RT_NOREF(pExtState);
7752 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
7753 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
7754 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
7755 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
7756}
7757
7758
7759/*
7760 * PADDB / VPADDB
7761 */
7762#ifdef IEM_WITHOUT_ASSEMBLY
7763
7764IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7765{
7766 RT_NOREF(pFpuState);
7767 RTUINT64U uSrc1 = { *puDst };
7768 RTUINT64U uSrc2 = { *puSrc };
7769 RTUINT64U uDst;
7770 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
7771 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
7772 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
7773 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
7774 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
7775 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
7776 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
7777 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
7778 *puDst = uDst.u;
7779}
7780
7781
7782IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7783{
7784 RT_NOREF(pFpuState);
7785 RTUINT128U uSrc1 = *puDst;
7786 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
7787 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
7788 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
7789 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
7790 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
7791 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
7792 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
7793 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
7794 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
7795 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
7796 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
7797 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
7798 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
7799 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
7800 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
7801 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
7802}
7803
7804#endif
7805
7806
7807IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7808 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7809{
7810 RT_NOREF(pExtState);
7811 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
7812 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
7813 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
7814 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
7815 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
7816 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
7817 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
7818 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
7819 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
7820 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
7821 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
7822 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
7823 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
7824 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
7825 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
7826 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
7827}
7828
7829IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7830 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7831{
7832 RT_NOREF(pExtState);
7833 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
7834 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
7835 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
7836 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
7837 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
7838 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
7839 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
7840 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
7841 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
7842 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
7843 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
7844 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
7845 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
7846 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
7847 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
7848 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
7849 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
7850 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
7851 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
7852 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
7853 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
7854 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
7855 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
7856 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
7857 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
7858 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
7859 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
7860 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
7861 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
7862 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
7863 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
7864 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
7865}
7866
7867
7868/*
7869 * PADDSB / VPADDSB
7870 */
/** Clamps a signed word-sized intermediate result to the signed byte range.
 *
 * Values within -128..127 are passed through as their 8-bit pattern.  Out of
 * range values yield 0x7f (INT8_MAX) when positive and 0x80 (INT8_MIN) when
 * negative; the sign is taken from bit 15 of the input.
 *
 * @note The argument is evaluated more than once. */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
7875
7876#ifdef IEM_WITHOUT_ASSEMBLY
7877
7878IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7879{
7880 RT_NOREF(pFpuState);
7881 RTUINT64U uSrc1 = { *puDst };
7882 RTUINT64U uSrc2 = { *puSrc };
7883 RTUINT64U uDst;
7884 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
7885 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
7886 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
7887 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
7888 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
7889 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
7890 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
7891 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
7892 *puDst = uDst.u;
7893}
7894
7895
7896IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7897{
7898 RT_NOREF(pFpuState);
7899 RTUINT128U uSrc1 = *puDst;
7900 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
7901 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
7902 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
7903 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
7904 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
7905 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
7906 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
7907 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
7908 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
7909 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
7910 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
7911 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
7912 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
7913 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
7914 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
7915 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
7916}
7917
7918#endif
7919
7920
7921/*
 * PADDUSB / VPADDUSB
7923 */
/** Clamps an unsigned word-sized sum to the unsigned byte range.
 *
 * Values up to 0xff are passed through; anything larger (after truncating
 * the input to 16 bits) yields 0xff (UINT8_MAX).
 *
 * @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
7928
7929#ifdef IEM_WITHOUT_ASSEMBLY
7930
7931IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7932{
7933 RT_NOREF(pFpuState);
7934 RTUINT64U uSrc1 = { *puDst };
7935 RTUINT64U uSrc2 = { *puSrc };
7936 RTUINT64U uDst;
7937 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
7938 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
7939 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
7940 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
7941 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
7942 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
7943 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
7944 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
7945 *puDst = uDst.u;
7946}
7947
7948
7949IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7950{
7951 RT_NOREF(pFpuState);
7952 RTUINT128U uSrc1 = *puDst;
7953 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
7954 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
7955 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
7956 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
7957 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
7958 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
7959 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
7960 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
7961 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
7962 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
7963 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
7964 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
7965 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
7966 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
7967 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
7968 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
7969}
7970
7971#endif
7972
7973
7974/*
7975 * PADDW / VPADDW
7976 */
7977#ifdef IEM_WITHOUT_ASSEMBLY
7978
7979IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7980{
7981 RT_NOREF(pFpuState);
7982 RTUINT64U uSrc1 = { *puDst };
7983 RTUINT64U uSrc2 = { *puSrc };
7984 RTUINT64U uDst;
7985 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
7986 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
7987 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
7988 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
7989 *puDst = uDst.u;
7990}
7991
7992
7993IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7994{
7995 RT_NOREF(pFpuState);
7996 RTUINT128U uSrc1 = *puDst;
7997 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
7998 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
7999 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8000 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8001 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8002 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8003 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8004 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8005}
8006
8007#endif
8008
8009
8010IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8011 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8012{
8013 RT_NOREF(pExtState);
8014 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8015 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8016 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8017 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8018 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8019 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8020 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8021 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8022}
8023
8024IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8025 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8026{
8027 RT_NOREF(pExtState);
8028 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8029 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8030 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8031 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8032 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8033 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8034 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8035 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8036 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8037 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8038 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8039 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8040 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8041 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8042 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8043 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8044}
8045
8046
8047/*
8048 * PADDSW / VPADDSW
8049 */
/** Clamps a signed dword-sized intermediate result to the signed word range.
 *
 * Values within -32768..32767 are passed through as their 16-bit pattern.
 * Out of range values yield 0x7fff (INT16_MAX) when positive and 0x8000
 * (INT16_MIN) when negative; the sign is taken from bit 31 of the input.
 *
 * @note The argument is evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8054
8055#ifdef IEM_WITHOUT_ASSEMBLY
8056
8057IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8058{
8059 RT_NOREF(pFpuState);
8060 RTUINT64U uSrc1 = { *puDst };
8061 RTUINT64U uSrc2 = { *puSrc };
8062 RTUINT64U uDst;
8063 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8064 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8065 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8066 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8067 *puDst = uDst.u;
8068}
8069
8070
8071IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8072{
8073 RT_NOREF(pFpuState);
8074 RTUINT128U uSrc1 = *puDst;
8075 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8076 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8077 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8078 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8079 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8080 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8081 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8082 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8083}
8084
8085#endif
8086
8087
8088/*
8089 * PADDUSW / VPADDUSW
8090 */
/** Clamps an unsigned dword-sized sum to the unsigned word range.
 *
 * Values up to 0xffff are passed through; anything larger (after converting
 * the input to 32 bits unsigned) yields 0xffff (UINT16_MAX).
 *
 * @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
8095
8096#ifdef IEM_WITHOUT_ASSEMBLY
8097
8098IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8099{
8100 RT_NOREF(pFpuState);
8101 RTUINT64U uSrc1 = { *puDst };
8102 RTUINT64U uSrc2 = { *puSrc };
8103 RTUINT64U uDst;
8104 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8105 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8106 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8107 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8108 *puDst = uDst.u;
8109}
8110
8111
8112IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8113{
8114 RT_NOREF(pFpuState);
8115 RTUINT128U uSrc1 = *puDst;
8116 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8117 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8118 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8119 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8120 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8121 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8122 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8123 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8124}
8125
8126#endif
8127
8128
8129/*
8130 * PADDD / VPADDD.
8131 */
8132#ifdef IEM_WITHOUT_ASSEMBLY
8133
8134IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8135{
8136 RT_NOREF(pFpuState);
8137 RTUINT64U uSrc1 = { *puDst };
8138 RTUINT64U uSrc2 = { *puSrc };
8139 RTUINT64U uDst;
8140 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8141 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8142 *puDst = uDst.u;
8143}
8144
8145
8146IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8147{
8148 RT_NOREF(pFpuState);
8149 RTUINT128U uSrc1 = *puDst;
8150 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8151 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8152 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8153 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8154}
8155
8156#endif /* IEM_WITHOUT_ASSEMBLY */
8157
8158IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8159 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8160{
8161 RT_NOREF(pExtState);
8162 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8163 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8164 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8165 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8166}
8167
8168IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8169 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8170{
8171 RT_NOREF(pExtState);
8172 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8173 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8174 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8175 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8176 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8177 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8178 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8179 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8180}
8181
8182
8183/*
8184 * PADDQ / VPADDQ.
8185 */
8186#ifdef IEM_WITHOUT_ASSEMBLY
8187
8188IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8189{
8190 RT_NOREF(pFpuState);
8191 *puDst = *puDst + *puSrc;
8192}
8193
8194IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8195{
8196 RT_NOREF(pFpuState);
8197 RTUINT128U uSrc1 = *puDst;
8198 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8199 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8200}
8201
8202#endif
8203
8204IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8205 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8206{
8207 RT_NOREF(pExtState);
8208 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8209 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8210}
8211
8212IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8213 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8214{
8215 RT_NOREF(pExtState);
8216 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8217 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8218 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8219 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8220}
8221
8222
8223/*
8224 * PSUBB / VPSUBB
8225 */
8226#ifdef IEM_WITHOUT_ASSEMBLY
8227
8228IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8229{
8230 RT_NOREF(pFpuState);
8231 RTUINT64U uSrc1 = { *puDst };
8232 RTUINT64U uSrc2 = { *puSrc };
8233 RTUINT64U uDst;
8234 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8235 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8236 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8237 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8238 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8239 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8240 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8241 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8242 *puDst = uDst.u;
8243}
8244
8245
8246IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8247{
8248 RT_NOREF(pFpuState);
8249 RTUINT128U uSrc1 = *puDst;
8250 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8251 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8252 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8253 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8254 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8255 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8256 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8257 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8258 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8259 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8260 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8261 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8262 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8263 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8264 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8265 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8266}
8267
8268#endif
8269
8270IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8271 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8272{
8273 RT_NOREF(pExtState);
8274 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8275 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8276 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8277 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8278 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8279 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8280 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8281 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8282 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8283 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8284 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8285 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8286 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8287 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8288 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8289 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8290}
8291
8292IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8293 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8294{
8295 RT_NOREF(pExtState);
8296 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8297 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8298 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8299 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8300 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8301 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8302 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8303 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8304 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8305 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8306 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8307 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8308 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8309 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8310 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8311 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8312 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8313 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8314 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8315 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8316 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8317 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8318 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8319 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8320 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8321 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8322 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8323 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8324 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8325 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8326 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8327 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8328}
8329
8330
8331/*
 * PSUBSB / VPSUBSB
8333 */
8334#ifdef IEM_WITHOUT_ASSEMBLY
8335
8336IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8337{
8338 RT_NOREF(pFpuState);
8339 RTUINT64U uSrc1 = { *puDst };
8340 RTUINT64U uSrc2 = { *puSrc };
8341 RTUINT64U uDst;
8342 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8343 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8344 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8345 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8346 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8347 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8348 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8349 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8350 *puDst = uDst.u;
8351}
8352
8353
8354IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8355{
8356 RT_NOREF(pFpuState);
8357 RTUINT128U uSrc1 = *puDst;
8358 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8359 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8360 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8361 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8362 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8363 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8364 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8365 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8366 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8367 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8368 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8369 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8370 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8371 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8372 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8373 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8374}
8375
8376#endif
8377
8378
8379/*
 * PSUBUSB / VPSUBUSB
8381 */
/**
 * Narrows the result of an unsigned byte subtraction to a byte.
 *
 * The value is viewed as a 16-bit unsigned quantity; anything above 0xff
 * (which is how a negative difference shows up after the cast) yields zero,
 * everything else is passed through unchanged.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
8386
8387#ifdef IEM_WITHOUT_ASSEMBLY
8388
8389IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8390{
8391 RT_NOREF(pFpuState);
8392 RTUINT64U uSrc1 = { *puDst };
8393 RTUINT64U uSrc2 = { *puSrc };
8394 RTUINT64U uDst;
8395 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
8396 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
8397 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
8398 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
8399 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
8400 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
8401 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
8402 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
8403 *puDst = uDst.u;
8404}
8405
8406
8407IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8408{
8409 RT_NOREF(pFpuState);
8410 RTUINT128U uSrc1 = *puDst;
8411 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
8412 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
8413 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
8414 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
8415 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
8416 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
8417 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
8418 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
8419 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
8420 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
8421 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
8422 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
8423 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
8424 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
8425 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
8426 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
8427}
8428
8429#endif
8430
8431
8432/*
8433 * PSUBW / VPSUBW
8434 */
8435#ifdef IEM_WITHOUT_ASSEMBLY
8436
8437IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8438{
8439 RT_NOREF(pFpuState);
8440 RTUINT64U uSrc1 = { *puDst };
8441 RTUINT64U uSrc2 = { *puSrc };
8442 RTUINT64U uDst;
8443 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
8444 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
8445 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
8446 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
8447 *puDst = uDst.u;
8448}
8449
8450
8451IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8452{
8453 RT_NOREF(pFpuState);
8454 RTUINT128U uSrc1 = *puDst;
8455 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
8456 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
8457 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
8458 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
8459 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
8460 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
8461 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
8462 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
8463}
8464
8465#endif
8466
8467IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8468 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8469{
8470 RT_NOREF(pExtState);
8471 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8472 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8473 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8474 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8475 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8476 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8477 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8478 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8479}
8480
8481IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8482 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8483{
8484 RT_NOREF(pExtState);
8485 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8486 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8487 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8488 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8489 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8490 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8491 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8492 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8493 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
8494 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
8495 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
8496 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
8497 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
8498 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
8499 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
8500 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
8501}
8502
8503
8504/*
8505 * PSUBSW / VPSUBSW
8506 */
8507#ifdef IEM_WITHOUT_ASSEMBLY
8508
8509IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8510{
8511 RT_NOREF(pFpuState);
8512 RTUINT64U uSrc1 = { *puDst };
8513 RTUINT64U uSrc2 = { *puSrc };
8514 RTUINT64U uDst;
8515 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
8516 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
8517 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
8518 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
8519 *puDst = uDst.u;
8520}
8521
8522
8523IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8524{
8525 RT_NOREF(pFpuState);
8526 RTUINT128U uSrc1 = *puDst;
8527 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
8528 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
8529 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
8530 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
8531 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
8532 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
8533 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
8534 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
8535}
8536
8537#endif
8538
8539
8540/*
8541 * PSUBUSW / VPSUBUSW
8542 */
/**
 * Narrows the result of an unsigned word subtraction to a word.
 *
 * The value is viewed as a 32-bit unsigned quantity; anything above 0xffff
 * (which is how a negative difference shows up after the cast) yields zero,
 * everything else is passed through unchanged.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
8547
8548#ifdef IEM_WITHOUT_ASSEMBLY
8549
8550IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8551{
8552 RT_NOREF(pFpuState);
8553 RTUINT64U uSrc1 = { *puDst };
8554 RTUINT64U uSrc2 = { *puSrc };
8555 RTUINT64U uDst;
8556 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
8557 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
8558 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
8559 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
8560 *puDst = uDst.u;
8561}
8562
8563
8564IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8565{
8566 RT_NOREF(pFpuState);
8567 RTUINT128U uSrc1 = *puDst;
8568 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
8569 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
8570 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
8571 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
8572 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
8573 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
8574 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
8575 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
8576}
8577
8578#endif
8579
8580
8581/*
8582 * PSUBD / VPSUBD.
8583 */
8584#ifdef IEM_WITHOUT_ASSEMBLY
8585
8586IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8587{
8588 RT_NOREF(pFpuState);
8589 RTUINT64U uSrc1 = { *puDst };
8590 RTUINT64U uSrc2 = { *puSrc };
8591 RTUINT64U uDst;
8592 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
8593 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
8594 *puDst = uDst.u;
8595}
8596
8597
8598IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8599{
8600 RT_NOREF(pFpuState);
8601 RTUINT128U uSrc1 = *puDst;
8602 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
8603 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
8604 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
8605 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
8606}
8607
8608#endif /* IEM_WITHOUT_ASSEMBLY */
8609
8610IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8611 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8612{
8613 RT_NOREF(pExtState);
8614 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
8615 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
8616 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
8617 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
8618}
8619
8620IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8621 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8622{
8623 RT_NOREF(pExtState);
8624 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
8625 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
8626 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
8627 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
8628 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
8629 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
8630 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
8631 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
8632}
8633
8634
8635/*
8636 * PSUBQ / VPSUBQ.
8637 */
8638#ifdef IEM_WITHOUT_ASSEMBLY
8639
8640IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8641{
8642 RT_NOREF(pFpuState);
8643 *puDst = *puDst - *puSrc;
8644}
8645
8646IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8647{
8648 RT_NOREF(pFpuState);
8649 RTUINT128U uSrc1 = *puDst;
8650 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
8651 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
8652}
8653
8654#endif
8655
8656IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8657 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8658{
8659 RT_NOREF(pExtState);
8660 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
8661 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
8662}
8663
8664IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8665 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8666{
8667 RT_NOREF(pExtState);
8668 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
8669 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
8670 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
8671 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
8672}
8673
8674
8675
8676/*
8677 * PMULLW / VPMULLW / PMULLD / VPMULLD
8678 */
8679#ifdef IEM_WITHOUT_ASSEMBLY
8680
8681IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8682{
8683 RT_NOREF(pFpuState);
8684 RTUINT64U uSrc1 = { *puDst };
8685 RTUINT64U uSrc2 = { *puSrc };
8686 RTUINT64U uDst;
8687 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
8688 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
8689 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
8690 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
8691 *puDst = uDst.u;
8692}
8693
8694
8695IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8696{
8697 RT_NOREF(pFpuState);
8698 RTUINT128U uSrc1 = *puDst;
8699 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
8700 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
8701 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
8702 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
8703 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
8704 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
8705 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
8706 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
8707}
8708
8709#endif
8710
8711IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8712{
8713 RTUINT128U uSrc1 = *puDst;
8714
8715 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
8716 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
8717 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
8718 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
8719 RT_NOREF(pFpuState);
8720}
8721
8722
8723IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8724{
8725 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
8726 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
8727 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
8728 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
8729 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
8730 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
8731 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
8732 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
8733}
8734
8735
8736IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8737{
8738 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
8739 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
8740 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
8741 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
8742 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
8743 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
8744 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
8745 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
8746 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
8747 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
8748 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
8749 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
8750 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
8751 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
8752 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
8753 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
8754}
8755
8756
8757IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8758{
8759 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
8760 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
8761 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
8762 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
8763}
8764
8765
8766IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8767{
8768 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
8769 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
8770 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
8771 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
8772 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
8773 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
8774 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
8775 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
8776}
8777
8778
8779/*
8780 * PMULHW / VPMULHW
8781 */
8782#ifdef IEM_WITHOUT_ASSEMBLY
8783
8784IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8785{
8786 RT_NOREF(pFpuState);
8787 RTUINT64U uSrc1 = { *puDst };
8788 RTUINT64U uSrc2 = { *puSrc };
8789 RTUINT64U uDst;
8790 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
8791 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
8792 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
8793 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
8794 *puDst = uDst.u;
8795}
8796
8797
8798IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8799{
8800 RT_NOREF(pFpuState);
8801 RTUINT128U uSrc1 = *puDst;
8802 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
8803 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
8804 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
8805 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
8806 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
8807 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
8808 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
8809 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
8810}
8811
8812#endif
8813
8814IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8815{
8816 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
8817 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
8818 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
8819 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
8820 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
8821 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
8822 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
8823 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
8824}
8825
8826
8827IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8828{
8829 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
8830 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
8831 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
8832 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
8833 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
8834 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
8835 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
8836 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
8837 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
8838 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
8839 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
8840 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
8841 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
8842 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
8843 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
8844 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
8845}
8846
8847
8848/*
8849 * PMULHUW / VPMULHUW
8850 */
8851#ifdef IEM_WITHOUT_ASSEMBLY
8852
8853IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8854{
8855 RTUINT64U uSrc1 = { *puDst };
8856 RTUINT64U uSrc2 = { *puSrc };
8857 RTUINT64U uDst;
8858 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
8859 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
8860 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
8861 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
8862 *puDst = uDst.u;
8863}
8864
8865
8866IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8867{
8868 RTUINT128U uSrc1 = *puDst;
8869 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
8870 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
8871 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
8872 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
8873 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
8874 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
8875 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
8876 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
8877}
8878
8879#endif
8880
8881IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8882{
8883 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
8884 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
8885 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
8886 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
8887 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
8888 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
8889 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
8890 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
8891}
8892
8893
8894IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8895{
8896 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
8897 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
8898 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
8899 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
8900 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
8901 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
8902 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
8903 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
8904 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
8905 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
8906 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
8907 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
8908 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
8909 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
8910 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
8911 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
8912}
8913
8914
8915/*
8916 * PSRLW / VPSRLW
8917 */
8918#ifdef IEM_WITHOUT_ASSEMBLY
8919
8920IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8921{
8922 RTUINT64U uSrc1 = { *puDst };
8923 RTUINT64U uSrc2 = { *puSrc };
8924 RTUINT64U uDst;
8925
8926 if (uSrc2.au64[0] <= 15)
8927 {
8928 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
8929 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
8930 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
8931 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
8932 }
8933 else
8934 {
8935 uDst.au64[0] = 0;
8936 }
8937 *puDst = uDst.u;
8938}
8939
8940
8941IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
8942{
8943 RTUINT64U uSrc1 = { *puDst };
8944 RTUINT64U uDst;
8945
8946 if (uShift <= 15)
8947 {
8948 uDst.au16[0] = uSrc1.au16[0] >> uShift;
8949 uDst.au16[1] = uSrc1.au16[1] >> uShift;
8950 uDst.au16[2] = uSrc1.au16[2] >> uShift;
8951 uDst.au16[3] = uSrc1.au16[3] >> uShift;
8952 }
8953 else
8954 {
8955 uDst.au64[0] = 0;
8956 }
8957 *puDst = uDst.u;
8958}
8959
8960
8961IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8962{
8963 RTUINT128U uSrc1 = *puDst;
8964
8965 if (puSrc->au64[0] <= 15)
8966 {
8967 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
8968 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
8969 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
8970 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
8971 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
8972 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
8973 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
8974 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
8975 }
8976 else
8977 {
8978 puDst->au64[0] = 0;
8979 puDst->au64[1] = 0;
8980 }
8981}
8982
8983IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
8984{
8985 RTUINT128U uSrc1 = *puDst;
8986
8987 if (uShift <= 15)
8988 {
8989 puDst->au16[0] = uSrc1.au16[0] >> uShift;
8990 puDst->au16[1] = uSrc1.au16[1] >> uShift;
8991 puDst->au16[2] = uSrc1.au16[2] >> uShift;
8992 puDst->au16[3] = uSrc1.au16[3] >> uShift;
8993 puDst->au16[4] = uSrc1.au16[4] >> uShift;
8994 puDst->au16[5] = uSrc1.au16[5] >> uShift;
8995 puDst->au16[6] = uSrc1.au16[6] >> uShift;
8996 puDst->au16[7] = uSrc1.au16[7] >> uShift;
8997 }
8998 else
8999 {
9000 puDst->au64[0] = 0;
9001 puDst->au64[1] = 0;
9002 }
9003}
9004
9005#endif
9006
9007
9008/*
9009 * PSRAW / VPSRAW
9010 */
9011#ifdef IEM_WITHOUT_ASSEMBLY
9012
9013IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9014{
9015 RTUINT64U uSrc1 = { *puDst };
9016 RTUINT64U uSrc2 = { *puSrc };
9017 RTUINT64U uDst;
9018
9019 if (uSrc2.au64[0] <= 15)
9020 {
9021 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
9022 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
9023 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
9024 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
9025 }
9026 else
9027 {
9028 uDst.au64[0] = 0;
9029 }
9030 *puDst = uDst.u;
9031}
9032
9033
9034IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9035{
9036 RTUINT64U uSrc1 = { *puDst };
9037 RTUINT64U uDst;
9038
9039 if (uShift <= 15)
9040 {
9041 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
9042 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
9043 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
9044 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
9045 }
9046 else
9047 {
9048 uDst.au64[0] = 0;
9049 }
9050 *puDst = uDst.u;
9051}
9052
9053
9054IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9055{
9056 RTUINT128U uSrc1 = *puDst;
9057
9058 if (puSrc->au64[0] <= 15)
9059 {
9060 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
9061 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
9062 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
9063 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
9064 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
9065 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
9066 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
9067 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
9068 }
9069 else
9070 {
9071 puDst->au64[0] = 0;
9072 puDst->au64[1] = 0;
9073 }
9074}
9075
9076IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9077{
9078 RTUINT128U uSrc1 = *puDst;
9079
9080 if (uShift <= 15)
9081 {
9082 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
9083 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
9084 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
9085 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
9086 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
9087 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
9088 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
9089 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
9090 }
9091 else
9092 {
9093 puDst->au64[0] = 0;
9094 puDst->au64[1] = 0;
9095 }
9096}
9097
9098#endif
9099
9100
9101/*
9102 * PSLLW / VPSLLW
9103 */
9104#ifdef IEM_WITHOUT_ASSEMBLY
9105
9106IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9107{
9108 RTUINT64U uSrc1 = { *puDst };
9109 RTUINT64U uSrc2 = { *puSrc };
9110 RTUINT64U uDst;
9111
9112 if (uSrc2.au64[0] <= 15)
9113 {
9114 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
9115 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
9116 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
9117 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
9118 }
9119 else
9120 {
9121 uDst.au64[0] = 0;
9122 }
9123 *puDst = uDst.u;
9124}
9125
9126
9127IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9128{
9129 RTUINT64U uSrc1 = { *puDst };
9130 RTUINT64U uDst;
9131
9132 if (uShift <= 15)
9133 {
9134 uDst.au16[0] = uSrc1.au16[0] << uShift;
9135 uDst.au16[1] = uSrc1.au16[1] << uShift;
9136 uDst.au16[2] = uSrc1.au16[2] << uShift;
9137 uDst.au16[3] = uSrc1.au16[3] << uShift;
9138 }
9139 else
9140 {
9141 uDst.au64[0] = 0;
9142 }
9143 *puDst = uDst.u;
9144}
9145
9146
9147IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9148{
9149 RTUINT128U uSrc1 = *puDst;
9150
9151 if (puSrc->au64[0] <= 15)
9152 {
9153 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
9154 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
9155 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
9156 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
9157 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
9158 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
9159 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
9160 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
9161 }
9162 else
9163 {
9164 puDst->au64[0] = 0;
9165 puDst->au64[1] = 0;
9166 }
9167}
9168
9169IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9170{
9171 RTUINT128U uSrc1 = *puDst;
9172
9173 if (uShift <= 15)
9174 {
9175 puDst->au16[0] = uSrc1.au16[0] << uShift;
9176 puDst->au16[1] = uSrc1.au16[1] << uShift;
9177 puDst->au16[2] = uSrc1.au16[2] << uShift;
9178 puDst->au16[3] = uSrc1.au16[3] << uShift;
9179 puDst->au16[4] = uSrc1.au16[4] << uShift;
9180 puDst->au16[5] = uSrc1.au16[5] << uShift;
9181 puDst->au16[6] = uSrc1.au16[6] << uShift;
9182 puDst->au16[7] = uSrc1.au16[7] << uShift;
9183 }
9184 else
9185 {
9186 puDst->au64[0] = 0;
9187 puDst->au64[1] = 0;
9188 }
9189}
9190
9191#endif
9192
9193
9194/*
9195 * PSRLD / VPSRLD
9196 */
9197#ifdef IEM_WITHOUT_ASSEMBLY
9198
9199IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9200{
9201 RTUINT64U uSrc1 = { *puDst };
9202 RTUINT64U uSrc2 = { *puSrc };
9203 RTUINT64U uDst;
9204
9205 if (uSrc2.au64[0] <= 31)
9206 {
9207 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9208 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9209 }
9210 else
9211 {
9212 uDst.au64[0] = 0;
9213 }
9214 *puDst = uDst.u;
9215}
9216
9217
9218IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9219{
9220 RTUINT64U uSrc1 = { *puDst };
9221 RTUINT64U uDst;
9222
9223 if (uShift <= 31)
9224 {
9225 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9226 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9227 }
9228 else
9229 {
9230 uDst.au64[0] = 0;
9231 }
9232 *puDst = uDst.u;
9233}
9234
9235
9236IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9237{
9238 RTUINT128U uSrc1 = *puDst;
9239
9240 if (puSrc->au64[0] <= 31)
9241 {
9242 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9243 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9244 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9245 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9246 }
9247 else
9248 {
9249 puDst->au64[0] = 0;
9250 puDst->au64[1] = 0;
9251 }
9252}
9253
9254IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9255{
9256 RTUINT128U uSrc1 = *puDst;
9257
9258 if (uShift <= 31)
9259 {
9260 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9261 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9262 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9263 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9264 }
9265 else
9266 {
9267 puDst->au64[0] = 0;
9268 puDst->au64[1] = 0;
9269 }
9270}
9271
9272#endif
9273
9274
9275/*
9276 * PSRAD / VPSRAD
9277 */
9278#ifdef IEM_WITHOUT_ASSEMBLY
9279
9280IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9281{
9282 RTUINT64U uSrc1 = { *puDst };
9283 RTUINT64U uSrc2 = { *puSrc };
9284 RTUINT64U uDst;
9285
9286 if (uSrc2.au64[0] <= 31)
9287 {
9288 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9289 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9290 }
9291 else
9292 {
9293 uDst.au64[0] = 0;
9294 }
9295 *puDst = uDst.u;
9296}
9297
9298
9299IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
9300{
9301 RTUINT64U uSrc1 = { *puDst };
9302 RTUINT64U uDst;
9303
9304 if (uShift <= 31)
9305 {
9306 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
9307 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
9308 }
9309 else
9310 {
9311 uDst.au64[0] = 0;
9312 }
9313 *puDst = uDst.u;
9314}
9315
9316
9317IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9318{
9319 RTUINT128U uSrc1 = *puDst;
9320
9321 if (puSrc->au64[0] <= 31)
9322 {
9323 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
9324 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
9325 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
9326 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
9327 }
9328 else
9329 {
9330 puDst->au64[0] = 0;
9331 puDst->au64[1] = 0;
9332 }
9333}
9334
9335IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9336{
9337 RTUINT128U uSrc1 = *puDst;
9338
9339 if (uShift <= 31)
9340 {
9341 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
9342 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
9343 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
9344 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
9345 }
9346 else
9347 {
9348 puDst->au64[0] = 0;
9349 puDst->au64[1] = 0;
9350 }
9351}
9352
9353#endif
9354
9355
9356/*
9357 * PSLLD / VPSLLD
9358 */
9359#ifdef IEM_WITHOUT_ASSEMBLY
9360
9361IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9362{
9363 RTUINT64U uSrc1 = { *puDst };
9364 RTUINT64U uSrc2 = { *puSrc };
9365 RTUINT64U uDst;
9366
9367 if (uSrc2.au64[0] <= 31)
9368 {
9369 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
9370 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
9371 }
9372 else
9373 {
9374 uDst.au64[0] = 0;
9375 }
9376 *puDst = uDst.u;
9377}
9378
9379
9380IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9381{
9382 RTUINT64U uSrc1 = { *puDst };
9383 RTUINT64U uDst;
9384
9385 if (uShift <= 31)
9386 {
9387 uDst.au32[0] = uSrc1.au32[0] << uShift;
9388 uDst.au32[1] = uSrc1.au32[1] << uShift;
9389 }
9390 else
9391 {
9392 uDst.au64[0] = 0;
9393 }
9394 *puDst = uDst.u;
9395}
9396
9397
9398IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9399{
9400 RTUINT128U uSrc1 = *puDst;
9401
9402 if (puSrc->au64[0] <= 31)
9403 {
9404 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
9405 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
9406 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
9407 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
9408 }
9409 else
9410 {
9411 puDst->au64[0] = 0;
9412 puDst->au64[1] = 0;
9413 }
9414}
9415
9416IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9417{
9418 RTUINT128U uSrc1 = *puDst;
9419
9420 if (uShift <= 31)
9421 {
9422 puDst->au32[0] = uSrc1.au32[0] << uShift;
9423 puDst->au32[1] = uSrc1.au32[1] << uShift;
9424 puDst->au32[2] = uSrc1.au32[2] << uShift;
9425 puDst->au32[3] = uSrc1.au32[3] << uShift;
9426 }
9427 else
9428 {
9429 puDst->au64[0] = 0;
9430 puDst->au64[1] = 0;
9431 }
9432}
9433
9434#endif
9435
9436
9437/*
9438 * PSRLQ / VPSRLQ
9439 */
9440#ifdef IEM_WITHOUT_ASSEMBLY
9441
9442IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9443{
9444 RTUINT64U uSrc1 = { *puDst };
9445 RTUINT64U uSrc2 = { *puSrc };
9446 RTUINT64U uDst;
9447
9448 if (uSrc2.au64[0] <= 63)
9449 {
9450 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
9451 }
9452 else
9453 {
9454 uDst.au64[0] = 0;
9455 }
9456 *puDst = uDst.u;
9457}
9458
9459
9460IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9461{
9462 RTUINT64U uSrc1 = { *puDst };
9463 RTUINT64U uDst;
9464
9465 if (uShift <= 63)
9466 {
9467 uDst.au64[0] = uSrc1.au64[0] >> uShift;
9468 }
9469 else
9470 {
9471 uDst.au64[0] = 0;
9472 }
9473 *puDst = uDst.u;
9474}
9475
9476
9477IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9478{
9479 RTUINT128U uSrc1 = *puDst;
9480
9481 if (puSrc->au64[0] <= 63)
9482 {
9483 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
9484 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
9485 }
9486 else
9487 {
9488 puDst->au64[0] = 0;
9489 puDst->au64[1] = 0;
9490 }
9491}
9492
9493IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9494{
9495 RTUINT128U uSrc1 = *puDst;
9496
9497 if (uShift <= 63)
9498 {
9499 puDst->au64[0] = uSrc1.au64[0] >> uShift;
9500 puDst->au64[1] = uSrc1.au64[1] >> uShift;
9501 }
9502 else
9503 {
9504 puDst->au64[0] = 0;
9505 puDst->au64[1] = 0;
9506 }
9507}
9508
9509#endif
9510
9511
9512/*
9513 * PSLLQ / VPSLLQ
9514 */
9515#ifdef IEM_WITHOUT_ASSEMBLY
9516
9517IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9518{
9519 RTUINT64U uSrc1 = { *puDst };
9520 RTUINT64U uSrc2 = { *puSrc };
9521 RTUINT64U uDst;
9522
9523 if (uSrc2.au64[0] <= 63)
9524 {
9525 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
9526 }
9527 else
9528 {
9529 uDst.au64[0] = 0;
9530 }
9531 *puDst = uDst.u;
9532}
9533
9534
9535IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9536{
9537 RTUINT64U uSrc1 = { *puDst };
9538 RTUINT64U uDst;
9539
9540 if (uShift <= 63)
9541 {
9542 uDst.au64[0] = uSrc1.au64[0] << uShift;
9543 }
9544 else
9545 {
9546 uDst.au64[0] = 0;
9547 }
9548 *puDst = uDst.u;
9549}
9550
9551
9552IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9553{
9554 RTUINT128U uSrc1 = *puDst;
9555
9556 if (puSrc->au64[0] <= 63)
9557 {
9558 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
9559 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
9560 }
9561 else
9562 {
9563 puDst->au64[0] = 0;
9564 puDst->au64[1] = 0;
9565 }
9566}
9567
9568IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9569{
9570 RTUINT128U uSrc1 = *puDst;
9571
9572 if (uShift <= 63)
9573 {
9574 puDst->au64[0] = uSrc1.au64[0] << uShift;
9575 puDst->au64[1] = uSrc1.au64[1] << uShift;
9576 }
9577 else
9578 {
9579 puDst->au64[0] = 0;
9580 puDst->au64[1] = 0;
9581 }
9582}
9583
9584#endif
9585
9586
9587/*
9588 * PSRLDQ / VPSRLDQ
9589 */
9590#ifdef IEM_WITHOUT_ASSEMBLY
9591
9592IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9593{
9594 RTUINT128U uSrc1 = *puDst;
9595
9596 if (uShift < 16)
9597 {
9598 int i;
9599
9600 for (i = 0; i < 16 - uShift; ++i)
9601 puDst->au8[i] = uSrc1.au8[i + uShift];
9602 for (i = 16 - uShift; i < 16; ++i)
9603 puDst->au8[i] = 0;
9604 }
9605 else
9606 {
9607 puDst->au64[0] = 0;
9608 puDst->au64[1] = 0;
9609 }
9610}
9611
9612#endif
9613
9614
9615/*
9616 * PSLLDQ / VPSLLDQ
9617 */
9618#ifdef IEM_WITHOUT_ASSEMBLY
9619
9620IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9621{
9622 RTUINT128U uSrc1 = *puDst;
9623
9624 if (uShift < 16)
9625 {
9626 int i;
9627
9628 for (i = 0; i < uShift; ++i)
9629 puDst->au8[i] = 0;
9630 for (i = uShift; i < 16; ++i)
9631 puDst->au8[i] = uSrc1.au8[i - uShift];
9632 }
9633 else
9634 {
9635 puDst->au64[0] = 0;
9636 puDst->au64[1] = 0;
9637 }
9638}
9639
9640#endif
9641
9642
9643/*
9644 * PMADDWD / VPMADDWD
9645 */
9646#ifdef IEM_WITHOUT_ASSEMBLY
9647
9648IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9649{
9650 RTUINT64U uSrc1 = { *puDst };
9651 RTUINT64U uSrc2 = { *puSrc };
9652 RTUINT64U uDst;
9653
9654 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
9655 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
9656 *puDst = uDst.u;
9657 RT_NOREF(pFpuState);
9658}
9659
9660
9661IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9662{
9663 RTUINT128U uSrc1 = *puDst;
9664
9665 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
9666 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
9667 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
9668 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
9669 RT_NOREF(pFpuState);
9670}
9671
9672#endif
9673
9674
9675/*
9676 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
9677 */
9678#ifdef IEM_WITHOUT_ASSEMBLY
9679
9680IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9681{
9682 RTUINT64U uSrc1 = { *puDst };
9683 RTUINT64U uSrc2 = { *puSrc };
9684 RTUINT64U uDst;
9685
9686 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
9687 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
9688 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
9689 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
9690 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
9691 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
9692 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
9693 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
9694 *puDst = uDst.u;
9695 RT_NOREF(pFpuState);
9696}
9697
9698
9699IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9700{
9701 RTUINT128U uSrc1 = *puDst;
9702
9703 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
9704 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
9705 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
9706 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
9707 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
9708 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
9709 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
9710 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
9711 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
9712 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
9713 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
9714 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
9715 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
9716 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
9717 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
9718 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
9719 RT_NOREF(pFpuState);
9720}
9721
9722#endif
9723
9724
9725IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9726{
9727 RTUINT128U uSrc1 = *puDst;
9728
9729 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
9730 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
9731 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
9732 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
9733 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
9734 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
9735 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
9736 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
9737 RT_NOREF(pFpuState);
9738}
9739
9740
9741IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9742{
9743 RTUINT128U uSrc1 = *puDst;
9744
9745 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
9746 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
9747 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
9748 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
9749 RT_NOREF(pFpuState);
9750}
9751
9752
9753IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9754 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9755{
9756 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
9757 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
9758 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
9759 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
9760 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
9761 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
9762 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
9763 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
9764 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
9765 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
9766 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
9767 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
9768 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
9769 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
9770 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
9771 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
9772 RT_NOREF(pExtState);
9773}
9774
9775
9776IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9777 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9778{
9779 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
9780 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
9781 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
9782 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
9783 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
9784 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
9785 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
9786 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
9787 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
9788 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
9789 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
9790 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
9791 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
9792 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
9793 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
9794 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
9795 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
9796 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
9797 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
9798 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
9799 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
9800 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
9801 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
9802 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
9803 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
9804 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
9805 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
9806 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
9807 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
9808 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
9809 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
9810 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
9811 RT_NOREF(pExtState);
9812}
9813
9814
9815IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9816 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9817{
9818 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
9819 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
9820 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
9821 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
9822 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
9823 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
9824 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
9825 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
9826 RT_NOREF(pExtState);
9827}
9828
9829
9830IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9831 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9832{
9833 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
9834 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
9835 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
9836 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
9837 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
9838 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
9839 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
9840 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
9841 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
9842 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
9843 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
9844 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
9845 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
9846 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
9847 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
9848 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
9849 RT_NOREF(pExtState);
9850}
9851
9852
9853IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9854 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9855{
9856 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
9857 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
9858 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
9859 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
9860 RT_NOREF(pExtState);
9861}
9862
9863
9864IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9865 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9866{
9867 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
9868 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
9869 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
9870 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
9871 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
9872 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
9873 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
9874 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
9875 RT_NOREF(pExtState);
9876}
9877
9878
9879/*
9880 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
9881 */
9882#ifdef IEM_WITHOUT_ASSEMBLY
9883
9884IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9885{
9886 RTUINT64U uSrc1 = { *puDst };
9887 RTUINT64U uSrc2 = { *puSrc };
9888 RTUINT64U uDst;
9889
9890 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
9891 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
9892 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
9893 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
9894 *puDst = uDst.u;
9895 RT_NOREF(pFpuState);
9896}
9897
9898
9899IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9900{
9901 RTUINT128U uSrc1 = *puDst;
9902
9903 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
9904 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
9905 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
9906 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
9907 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
9908 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
9909 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
9910 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
9911 RT_NOREF(pFpuState);
9912}
9913
9914#endif
9915
9916IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9917{
9918 RTUINT128U uSrc1 = *puDst;
9919
9920 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
9921 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
9922 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
9923 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
9924 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
9925 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
9926 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
9927 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
9928 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
9929 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
9930 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
9931 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
9932 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
9933 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
9934 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
9935 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
9936 RT_NOREF(pFpuState);
9937}
9938
9939
9940IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9941{
9942 RTUINT128U uSrc1 = *puDst;
9943
9944 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
9945 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
9946 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
9947 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
9948 RT_NOREF(pFpuState);
9949}
9950
9951
9952IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9953 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9954{
9955 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
9956 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
9957 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
9958 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
9959 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
9960 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
9961 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
9962 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
9963 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
9964 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
9965 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
9966 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
9967 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
9968 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
9969 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
9970 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
9971 RT_NOREF(pExtState);
9972}
9973
9974
9975IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9976 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9977{
9978 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
9979 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
9980 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
9981 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
9982 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
9983 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
9984 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
9985 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
9986 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
9987 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
9988 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
9989 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
9990 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
9991 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
9992 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
9993 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
9994 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
9995 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
9996 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
9997 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
9998 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
9999 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10000 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10001 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10002 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10003 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10004 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10005 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10006 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10007 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10008 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10009 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10010 RT_NOREF(pExtState);
10011}
10012
10013
10014IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10015 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10016{
10017 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10018 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10019 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10020 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10021 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10022 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10023 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10024 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10025 RT_NOREF(pExtState);
10026}
10027
10028
10029IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10030 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10031{
10032 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10033 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10034 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10035 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10036 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10037 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10038 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10039 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10040 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10041 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10042 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
10043 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
10044 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
10045 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
10046 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
10047 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
10048 RT_NOREF(pExtState);
10049}
10050
10051
10052IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10053 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10054{
10055 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10056 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10057 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10058 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10059 RT_NOREF(pExtState);
10060}
10061
10062
10063IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10064 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10065{
10066 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10067 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10068 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10069 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10070 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10071 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10072 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10073 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10074 RT_NOREF(pExtState);
10075}
10076
10077
10078/*
10079 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
10080 */
10081#ifdef IEM_WITHOUT_ASSEMBLY
10082
10083IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10084{
10085 RTUINT64U uSrc1 = { *puDst };
10086 RTUINT64U uSrc2 = { *puSrc };
10087 RTUINT64U uDst;
10088
10089 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
10090 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
10091 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
10092 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
10093 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
10094 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
10095 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
10096 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
10097 *puDst = uDst.u;
10098 RT_NOREF(pFpuState);
10099}
10100
10101
10102IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10103{
10104 RTUINT128U uSrc1 = *puDst;
10105
10106 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
10107 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
10108 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
10109 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
10110 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
10111 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
10112 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
10113 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
10114 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
10115 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
10116 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
10117 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
10118 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
10119 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
10120 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
10121 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
10122 RT_NOREF(pFpuState);
10123}
10124
10125#endif
10126
10127IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10128{
10129 RTUINT128U uSrc1 = *puDst;
10130
10131 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
10132 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
10133 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
10134 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
10135 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
10136 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
10137 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
10138 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
10139 RT_NOREF(pFpuState);
10140}
10141
10142
10143IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10144{
10145 RTUINT128U uSrc1 = *puDst;
10146
10147 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
10148 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
10149 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
10150 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
10151 RT_NOREF(pFpuState);
10152}
10153
10154
10155IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10156 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10157{
10158 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10159 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10160 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10161 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10162 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10163 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10164 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10165 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10166 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10167 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10168 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10169 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10170 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10171 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10172 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10173 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10174 RT_NOREF(pExtState);
10175}
10176
10177
10178IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10179 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10180{
10181 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10182 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10183 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10184 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10185 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10186 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10187 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10188 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10189 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10190 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10191 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10192 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10193 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10194 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10195 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10196 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10197 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
10198 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
10199 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
10200 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
10201 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
10202 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
10203 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
10204 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
10205 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
10206 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
10207 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
10208 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
10209 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
10210 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
10211 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
10212 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
10213 RT_NOREF(pExtState);
10214}
10215
10216
10217IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10218 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10219{
10220 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10221 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10222 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10223 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10224 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10225 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10226 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10227 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10228 RT_NOREF(pExtState);
10229}
10230
10231
10232IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10233 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10234{
10235 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10236 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10237 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10238 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10239 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10240 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10241 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10242 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10243 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10244 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10245 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
10246 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
10247 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
10248 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
10249 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
10250 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
10251 RT_NOREF(pExtState);
10252}
10253
10254
10255IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10256 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10257{
10258 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10259 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10260 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10261 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10262 RT_NOREF(pExtState);
10263}
10264
10265
10266IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10267 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10268{
10269 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10270 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10271 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10272 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10273 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10274 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10275 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10276 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10277 RT_NOREF(pExtState);
10278}
10279
10280
10281/*
10282 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
10283 */
10284#ifdef IEM_WITHOUT_ASSEMBLY
10285
10286IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10287{
10288 RTUINT64U uSrc1 = { *puDst };
10289 RTUINT64U uSrc2 = { *puSrc };
10290 RTUINT64U uDst;
10291
10292 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
10293 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
10294 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
10295 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
10296 *puDst = uDst.u;
10297 RT_NOREF(pFpuState);
10298}
10299
10300
10301IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10302{
10303 RTUINT128U uSrc1 = *puDst;
10304
10305 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10306 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10307 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10308 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10309 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10310 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10311 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10312 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10313 RT_NOREF(pFpuState);
10314}
10315
10316#endif
10317
10318IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10319{
10320 RTUINT128U uSrc1 = *puDst;
10321
10322 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10323 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10324 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10325 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10326 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10327 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10328 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10329 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10330 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10331 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10332 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
10333 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
10334 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
10335 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
10336 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
10337 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
10338 RT_NOREF(pFpuState);
10339}
10340
10341
10342IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10343{
10344 RTUINT128U uSrc1 = *puDst;
10345
10346 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10347 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10348 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10349 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10350 RT_NOREF(pFpuState);
10351}
10352
10353
10354IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10355 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10356{
10357 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10358 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10359 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10360 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10361 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10362 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10363 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10364 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10365 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10366 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10367 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10368 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10369 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10370 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10371 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10372 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10373 RT_NOREF(pExtState);
10374}
10375
10376
10377IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10378 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10379{
10380 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10381 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10382 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10383 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10384 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10385 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10386 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10387 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10388 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10389 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10390 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10391 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10392 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10393 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10394 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10395 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10396 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
10397 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
10398 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
10399 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
10400 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
10401 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
10402 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
10403 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
10404 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
10405 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
10406 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
10407 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
10408 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
10409 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
10410 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
10411 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
10412 RT_NOREF(pExtState);
10413}
10414
10415
10416IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10417 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10418{
10419 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10420 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10421 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10422 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10423 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10424 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10425 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10426 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10427 RT_NOREF(pExtState);
10428}
10429
10430
10431IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10432 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10433{
10434 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10435 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10436 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10437 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10438 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10439 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10440 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10441 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10442 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10443 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10444 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
10445 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
10446 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
10447 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
10448 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
10449 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
10450 RT_NOREF(pExtState);
10451}
10452
10453
10454IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10455 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10456{
10457 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10458 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10459 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10460 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10461 RT_NOREF(pExtState);
10462}
10463
10464
10465IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10466 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10467{
10468 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10469 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10470 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10471 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10472 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10473 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10474 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10475 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10476 RT_NOREF(pExtState);
10477}
10478
10479
10480/*
10481 * PAVGB / VPAVGB / PAVGW / VPAVGW
10482 */
/** Rounded-up average of two bytes: the first operand is widened to 16 bits
 *  before the addition, so the sum plus the rounding +1 is not truncated to
 *  8 bits ahead of the halving shift. */
#define PAVGB_EXEC(a_Src1, a_Src2) ((uint8_t)(((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1))
/** Rounded-up average of two words: same scheme as PAVGB_EXEC, with the first
 *  operand widened to 32 bits and the result narrowed back to 16 bits. */
#define PAVGW_EXEC(a_Src1, a_Src2) ((uint16_t)(((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1))
10485
10486#ifdef IEM_WITHOUT_ASSEMBLY
10487
10488IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
10489{
10490 RTUINT64U uSrc1 = { *puDst };
10491 RTUINT64U uSrc2 = { *puSrc };
10492 RTUINT64U uDst;
10493
10494 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
10495 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
10496 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
10497 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
10498 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
10499 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
10500 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
10501 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
10502 *puDst = uDst.u;
10503}
10504
10505
10506IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10507{
10508 RTUINT128U uSrc1 = *puDst;
10509
10510 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
10511 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
10512 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
10513 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
10514 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
10515 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
10516 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
10517 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
10518 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
10519 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
10520 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
10521 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
10522 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
10523 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
10524 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
10525 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
10526}
10527
10528
10529IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10530{
10531 RTUINT64U uSrc1 = { *puDst };
10532 RTUINT64U uSrc2 = { *puSrc };
10533 RTUINT64U uDst;
10534
10535 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
10536 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
10537 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
10538 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
10539 *puDst = uDst.u;
10540}
10541
10542
10543IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10544{
10545 RTUINT128U uSrc1 = *puDst;
10546
10547 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
10548 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
10549 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
10550 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
10551 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
10552 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
10553 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
10554 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
10555}
10556
10557#endif
10558
10559IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10560{
10561 RTUINT128U uSrc1 = *puDst;
10562
10563 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
10564 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
10565 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
10566 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
10567 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
10568 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
10569 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
10570 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
10571 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
10572 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
10573 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
10574 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
10575 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
10576 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
10577 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
10578 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
10579}
10580
10581
10582IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10583{
10584 RTUINT128U uSrc1 = *puDst;
10585
10586 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
10587 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
10588 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
10589 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
10590 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
10591 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
10592 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
10593 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
10594 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
10595 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
10596 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
10597 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
10598 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
10599 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
10600 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
10601 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
10602}
10603
10604
10605IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10606{
10607 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10608 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10609 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10610 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10611 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10612 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10613 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10614 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10615 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10616 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10617 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
10618 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
10619 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
10620 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
10621 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
10622 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
10623}
10624
10625
10626IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10627{
10628 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10629 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10630 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10631 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10632 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10633 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10634 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10635 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10636 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10637 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10638 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
10639 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
10640 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
10641 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
10642 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
10643 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
10644 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
10645 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
10646 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
10647 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
10648 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
10649 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
10650 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
10651 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
10652 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
10653 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
10654 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
10655 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
10656 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
10657 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
10658 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
10659 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
10660}
10661
10662
10663IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10664{
10665 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10666 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10667 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10668 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10669 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10670 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10671 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10672 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10673}
10674
10675
10676IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10677{
10678 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10679 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10680 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10681 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10682 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10683 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10684 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10685 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10686 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10687 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10688 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
10689 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
10690 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
10691 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
10692 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
10693 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
10694}
10695
10696#undef PAVGB_EXEC
10697#undef PAVGW_EXEC
10698
10699
10700/*
10701 * PMOVMSKB / VPMOVMSKB
10702 */
10703#ifdef IEM_WITHOUT_ASSEMBLY
10704
10705IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
10706{
10707 /* The the most signficant bit from each byte and store them in the given general purpose register. */
10708 uint64_t const uSrc = *pu64Src;
10709 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
10710 | ((uSrc >> (15-1)) & RT_BIT_64(1))
10711 | ((uSrc >> (23-2)) & RT_BIT_64(2))
10712 | ((uSrc >> (31-3)) & RT_BIT_64(3))
10713 | ((uSrc >> (39-4)) & RT_BIT_64(4))
10714 | ((uSrc >> (47-5)) & RT_BIT_64(5))
10715 | ((uSrc >> (55-6)) & RT_BIT_64(6))
10716 | ((uSrc >> (63-7)) & RT_BIT_64(7));
10717}
10718
10719
10720IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
10721{
10722 /* The the most signficant bit from each byte and store them in the given general purpose register. */
10723 uint64_t const uSrc0 = pu128Src->QWords.qw0;
10724 uint64_t const uSrc1 = pu128Src->QWords.qw1;
10725 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
10726 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
10727 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
10728 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
10729 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
10730 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
10731 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
10732 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
10733 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
10734 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
10735 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
10736 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
10737 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
10738 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
10739 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
10740 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
10741}
10742
10743#endif
10744
10745IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
10746{
10747 /* The the most signficant bit from each byte and store them in the given general purpose register. */
10748 uint64_t const uSrc0 = puSrc->QWords.qw0;
10749 uint64_t const uSrc1 = puSrc->QWords.qw1;
10750 uint64_t const uSrc2 = puSrc->QWords.qw2;
10751 uint64_t const uSrc3 = puSrc->QWords.qw3;
10752 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
10753 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
10754 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
10755 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
10756 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
10757 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
10758 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
10759 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
10760 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
10761 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
10762 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
10763 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
10764 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
10765 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
10766 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
10767 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
10768 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
10769 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
10770 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
10771 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
10772 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
10773 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
10774 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
10775 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
10776 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
10777 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
10778 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
10779 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
10780 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
10781 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
10782 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
10783 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
10784}
10785
10786
10787/*
10788 * [V]PSHUFB
10789 */
10790
10791IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10792{
10793 RTUINT64U const uSrc = { *puSrc };
10794 RTUINT64U const uDstIn = { *puDst };
10795 ASMCompilerBarrier();
10796 RTUINT64U uDstOut = { 0 };
10797 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
10798 {
10799 uint8_t idxSrc = uSrc.au8[iByte];
10800 if (!(idxSrc & 0x80))
10801 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
10802 }
10803 *puDst = uDstOut.u;
10804 RT_NOREF(pFpuState);
10805}
10806
10807
10808IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10809{
10810 RTUINT128U const uSrc = *puSrc;
10811 RTUINT128U const uDstIn = *puDst;
10812 ASMCompilerBarrier();
10813 puDst->au64[0] = 0;
10814 puDst->au64[1] = 0;
10815 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
10816 {
10817 uint8_t idxSrc = uSrc.au8[iByte];
10818 if (!(idxSrc & 0x80))
10819 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
10820 }
10821 RT_NOREF(pFpuState);
10822}
10823
10824
10825IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10826 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10827{
10828 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
10829 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
10830 ASMCompilerBarrier();
10831 puDst->au64[0] = 0;
10832 puDst->au64[1] = 0;
10833 for (unsigned iByte = 0; iByte < 16; iByte++)
10834 {
10835 uint8_t idxSrc = uSrc2.au8[iByte];
10836 if (!(idxSrc & 0x80))
10837 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
10838 }
10839 RT_NOREF(pExtState);
10840}
10841
10842
10843IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10844 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10845{
10846 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
10847 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
10848 ASMCompilerBarrier();
10849 puDst->au64[0] = 0;
10850 puDst->au64[1] = 0;
10851 puDst->au64[2] = 0;
10852 puDst->au64[3] = 0;
10853 for (unsigned iByte = 0; iByte < 16; iByte++)
10854 {
10855 uint8_t idxSrc = uSrc2.au8[iByte];
10856 if (!(idxSrc & 0x80))
10857 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
10858 }
10859 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
10860 {
10861 uint8_t idxSrc = uSrc2.au8[iByte];
10862 if (!(idxSrc & 0x80))
10863 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
10864 }
10865 RT_NOREF(pExtState);
10866}
10867
10868
10869/*
10870 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
10871 */
10872#ifdef IEM_WITHOUT_ASSEMBLY
10873
10874IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
10875{
10876 uint64_t const uSrc = *puSrc;
10877 ASMCompilerBarrier();
10878 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
10879 uSrc >> (((bEvil >> 2) & 3) * 16),
10880 uSrc >> (((bEvil >> 4) & 3) * 16),
10881 uSrc >> (((bEvil >> 6) & 3) * 16));
10882}
10883
10884
10885IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
10886{
10887 puDst->QWords.qw0 = puSrc->QWords.qw0;
10888 uint64_t const uSrc = puSrc->QWords.qw1;
10889 ASMCompilerBarrier();
10890 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
10891 uSrc >> (((bEvil >> 2) & 3) * 16),
10892 uSrc >> (((bEvil >> 4) & 3) * 16),
10893 uSrc >> (((bEvil >> 6) & 3) * 16));
10894}
10895
10896#endif
10897
10898IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
10899{
10900 puDst->QWords.qw0 = puSrc->QWords.qw0;
10901 uint64_t const uSrc1 = puSrc->QWords.qw1;
10902 puDst->QWords.qw2 = puSrc->QWords.qw2;
10903 uint64_t const uSrc3 = puSrc->QWords.qw3;
10904 ASMCompilerBarrier();
10905 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
10906 uSrc1 >> (((bEvil >> 2) & 3) * 16),
10907 uSrc1 >> (((bEvil >> 4) & 3) * 16),
10908 uSrc1 >> (((bEvil >> 6) & 3) * 16));
10909 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
10910 uSrc3 >> (((bEvil >> 2) & 3) * 16),
10911 uSrc3 >> (((bEvil >> 4) & 3) * 16),
10912 uSrc3 >> (((bEvil >> 6) & 3) * 16));
10913}
10914
10915#ifdef IEM_WITHOUT_ASSEMBLY
10916IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
10917{
10918 puDst->QWords.qw1 = puSrc->QWords.qw1;
10919 uint64_t const uSrc = puSrc->QWords.qw0;
10920 ASMCompilerBarrier();
10921 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
10922 uSrc >> (((bEvil >> 2) & 3) * 16),
10923 uSrc >> (((bEvil >> 4) & 3) * 16),
10924 uSrc >> (((bEvil >> 6) & 3) * 16));
10925
10926}
10927#endif
10928
10929
10930IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
10931{
10932 puDst->QWords.qw3 = puSrc->QWords.qw3;
10933 uint64_t const uSrc2 = puSrc->QWords.qw2;
10934 puDst->QWords.qw1 = puSrc->QWords.qw1;
10935 uint64_t const uSrc0 = puSrc->QWords.qw0;
10936 ASMCompilerBarrier();
10937 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
10938 uSrc0 >> (((bEvil >> 2) & 3) * 16),
10939 uSrc0 >> (((bEvil >> 4) & 3) * 16),
10940 uSrc0 >> (((bEvil >> 6) & 3) * 16));
10941 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
10942 uSrc2 >> (((bEvil >> 2) & 3) * 16),
10943 uSrc2 >> (((bEvil >> 4) & 3) * 16),
10944 uSrc2 >> (((bEvil >> 6) & 3) * 16));
10945
10946}
10947
10948
10949#ifdef IEM_WITHOUT_ASSEMBLY
10950IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
10951{
10952 RTUINT128U const uSrc = *puSrc;
10953 ASMCompilerBarrier();
10954 puDst->au32[0] = uSrc.au32[bEvil & 3];
10955 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
10956 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
10957 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
10958}
10959#endif
10960
10961
10962IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
10963{
10964 RTUINT256U const uSrc = *puSrc;
10965 ASMCompilerBarrier();
10966 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
10967 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
10968 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
10969 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
10970 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
10971 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
10972 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
10973 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
10974}
10975
10976
10977/*
10978 * PUNPCKHBW - high bytes -> words
10979 */
10980#ifdef IEM_WITHOUT_ASSEMBLY
10981
10982IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10983{
10984 RTUINT64U const uSrc2 = { *puSrc };
10985 RTUINT64U const uSrc1 = { *puDst };
10986 ASMCompilerBarrier();
10987 RTUINT64U uDstOut;
10988 uDstOut.au8[0] = uSrc1.au8[4];
10989 uDstOut.au8[1] = uSrc2.au8[4];
10990 uDstOut.au8[2] = uSrc1.au8[5];
10991 uDstOut.au8[3] = uSrc2.au8[5];
10992 uDstOut.au8[4] = uSrc1.au8[6];
10993 uDstOut.au8[5] = uSrc2.au8[6];
10994 uDstOut.au8[6] = uSrc1.au8[7];
10995 uDstOut.au8[7] = uSrc2.au8[7];
10996 *puDst = uDstOut.u;
10997}
10998
10999
11000IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11001{
11002 RTUINT128U const uSrc2 = *puSrc;
11003 RTUINT128U const uSrc1 = *puDst;
11004 ASMCompilerBarrier();
11005 RTUINT128U uDstOut;
11006 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11007 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11008 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11009 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11010 uDstOut.au8[ 4] = uSrc1.au8[10];
11011 uDstOut.au8[ 5] = uSrc2.au8[10];
11012 uDstOut.au8[ 6] = uSrc1.au8[11];
11013 uDstOut.au8[ 7] = uSrc2.au8[11];
11014 uDstOut.au8[ 8] = uSrc1.au8[12];
11015 uDstOut.au8[ 9] = uSrc2.au8[12];
11016 uDstOut.au8[10] = uSrc1.au8[13];
11017 uDstOut.au8[11] = uSrc2.au8[13];
11018 uDstOut.au8[12] = uSrc1.au8[14];
11019 uDstOut.au8[13] = uSrc2.au8[14];
11020 uDstOut.au8[14] = uSrc1.au8[15];
11021 uDstOut.au8[15] = uSrc2.au8[15];
11022 *puDst = uDstOut;
11023}
11024
11025#endif
11026
11027IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11028{
11029 RTUINT128U const uSrc2 = *puSrc2;
11030 RTUINT128U const uSrc1 = *puSrc1;
11031 ASMCompilerBarrier();
11032 RTUINT128U uDstOut;
11033 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11034 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11035 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11036 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11037 uDstOut.au8[ 4] = uSrc1.au8[10];
11038 uDstOut.au8[ 5] = uSrc2.au8[10];
11039 uDstOut.au8[ 6] = uSrc1.au8[11];
11040 uDstOut.au8[ 7] = uSrc2.au8[11];
11041 uDstOut.au8[ 8] = uSrc1.au8[12];
11042 uDstOut.au8[ 9] = uSrc2.au8[12];
11043 uDstOut.au8[10] = uSrc1.au8[13];
11044 uDstOut.au8[11] = uSrc2.au8[13];
11045 uDstOut.au8[12] = uSrc1.au8[14];
11046 uDstOut.au8[13] = uSrc2.au8[14];
11047 uDstOut.au8[14] = uSrc1.au8[15];
11048 uDstOut.au8[15] = uSrc2.au8[15];
11049 *puDst = uDstOut;
11050}
11051
11052
11053IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11054{
11055 RTUINT256U const uSrc2 = *puSrc2;
11056 RTUINT256U const uSrc1 = *puSrc1;
11057 ASMCompilerBarrier();
11058 RTUINT256U uDstOut;
11059 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11060 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11061 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11062 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11063 uDstOut.au8[ 4] = uSrc1.au8[10];
11064 uDstOut.au8[ 5] = uSrc2.au8[10];
11065 uDstOut.au8[ 6] = uSrc1.au8[11];
11066 uDstOut.au8[ 7] = uSrc2.au8[11];
11067 uDstOut.au8[ 8] = uSrc1.au8[12];
11068 uDstOut.au8[ 9] = uSrc2.au8[12];
11069 uDstOut.au8[10] = uSrc1.au8[13];
11070 uDstOut.au8[11] = uSrc2.au8[13];
11071 uDstOut.au8[12] = uSrc1.au8[14];
11072 uDstOut.au8[13] = uSrc2.au8[14];
11073 uDstOut.au8[14] = uSrc1.au8[15];
11074 uDstOut.au8[15] = uSrc2.au8[15];
11075 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11076 uDstOut.au8[16] = uSrc1.au8[24];
11077 uDstOut.au8[17] = uSrc2.au8[24];
11078 uDstOut.au8[18] = uSrc1.au8[25];
11079 uDstOut.au8[19] = uSrc2.au8[25];
11080 uDstOut.au8[20] = uSrc1.au8[26];
11081 uDstOut.au8[21] = uSrc2.au8[26];
11082 uDstOut.au8[22] = uSrc1.au8[27];
11083 uDstOut.au8[23] = uSrc2.au8[27];
11084 uDstOut.au8[24] = uSrc1.au8[28];
11085 uDstOut.au8[25] = uSrc2.au8[28];
11086 uDstOut.au8[26] = uSrc1.au8[29];
11087 uDstOut.au8[27] = uSrc2.au8[29];
11088 uDstOut.au8[28] = uSrc1.au8[30];
11089 uDstOut.au8[29] = uSrc2.au8[30];
11090 uDstOut.au8[30] = uSrc1.au8[31];
11091 uDstOut.au8[31] = uSrc2.au8[31];
11092 *puDst = uDstOut;
11093}
11094
11095
11096/*
11097 * PUNPCKHWD - high words -> dwords
11098 */
11099#ifdef IEM_WITHOUT_ASSEMBLY
11100
11101IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11102{
11103 RTUINT64U const uSrc2 = { *puSrc };
11104 RTUINT64U const uSrc1 = { *puDst };
11105 ASMCompilerBarrier();
11106 RTUINT64U uDstOut;
11107 uDstOut.au16[0] = uSrc1.au16[2];
11108 uDstOut.au16[1] = uSrc2.au16[2];
11109 uDstOut.au16[2] = uSrc1.au16[3];
11110 uDstOut.au16[3] = uSrc2.au16[3];
11111 *puDst = uDstOut.u;
11112}
11113
11114
11115IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11116{
11117 RTUINT128U const uSrc2 = *puSrc;
11118 RTUINT128U const uSrc1 = *puDst;
11119 ASMCompilerBarrier();
11120 RTUINT128U uDstOut;
11121 uDstOut.au16[0] = uSrc1.au16[4];
11122 uDstOut.au16[1] = uSrc2.au16[4];
11123 uDstOut.au16[2] = uSrc1.au16[5];
11124 uDstOut.au16[3] = uSrc2.au16[5];
11125 uDstOut.au16[4] = uSrc1.au16[6];
11126 uDstOut.au16[5] = uSrc2.au16[6];
11127 uDstOut.au16[6] = uSrc1.au16[7];
11128 uDstOut.au16[7] = uSrc2.au16[7];
11129 *puDst = uDstOut;
11130}
11131
11132#endif
11133
11134IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11135{
11136 RTUINT128U const uSrc2 = *puSrc2;
11137 RTUINT128U const uSrc1 = *puSrc1;
11138 ASMCompilerBarrier();
11139 RTUINT128U uDstOut;
11140 uDstOut.au16[0] = uSrc1.au16[4];
11141 uDstOut.au16[1] = uSrc2.au16[4];
11142 uDstOut.au16[2] = uSrc1.au16[5];
11143 uDstOut.au16[3] = uSrc2.au16[5];
11144 uDstOut.au16[4] = uSrc1.au16[6];
11145 uDstOut.au16[5] = uSrc2.au16[6];
11146 uDstOut.au16[6] = uSrc1.au16[7];
11147 uDstOut.au16[7] = uSrc2.au16[7];
11148 *puDst = uDstOut;
11149}
11150
11151
11152IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11153{
11154 RTUINT256U const uSrc2 = *puSrc2;
11155 RTUINT256U const uSrc1 = *puSrc1;
11156 ASMCompilerBarrier();
11157 RTUINT256U uDstOut;
11158 uDstOut.au16[0] = uSrc1.au16[4];
11159 uDstOut.au16[1] = uSrc2.au16[4];
11160 uDstOut.au16[2] = uSrc1.au16[5];
11161 uDstOut.au16[3] = uSrc2.au16[5];
11162 uDstOut.au16[4] = uSrc1.au16[6];
11163 uDstOut.au16[5] = uSrc2.au16[6];
11164 uDstOut.au16[6] = uSrc1.au16[7];
11165 uDstOut.au16[7] = uSrc2.au16[7];
11166
11167 uDstOut.au16[8] = uSrc1.au16[12];
11168 uDstOut.au16[9] = uSrc2.au16[12];
11169 uDstOut.au16[10] = uSrc1.au16[13];
11170 uDstOut.au16[11] = uSrc2.au16[13];
11171 uDstOut.au16[12] = uSrc1.au16[14];
11172 uDstOut.au16[13] = uSrc2.au16[14];
11173 uDstOut.au16[14] = uSrc1.au16[15];
11174 uDstOut.au16[15] = uSrc2.au16[15];
11175 *puDst = uDstOut;
11176}
11177
11178
11179/*
11180 * PUNPCKHDQ - high dwords -> qword(s)
11181 */
11182#ifdef IEM_WITHOUT_ASSEMBLY
11183
11184IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11185{
11186 RTUINT64U const uSrc2 = { *puSrc };
11187 RTUINT64U const uSrc1 = { *puDst };
11188 ASMCompilerBarrier();
11189 RTUINT64U uDstOut;
11190 uDstOut.au32[0] = uSrc1.au32[1];
11191 uDstOut.au32[1] = uSrc2.au32[1];
11192 *puDst = uDstOut.u;
11193}
11194
11195
11196IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11197{
11198 RTUINT128U const uSrc2 = *puSrc;
11199 RTUINT128U const uSrc1 = *puDst;
11200 ASMCompilerBarrier();
11201 RTUINT128U uDstOut;
11202 uDstOut.au32[0] = uSrc1.au32[2];
11203 uDstOut.au32[1] = uSrc2.au32[2];
11204 uDstOut.au32[2] = uSrc1.au32[3];
11205 uDstOut.au32[3] = uSrc2.au32[3];
11206 *puDst = uDstOut;
11207}
11208
11209#endif
11210
11211IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11212{
11213 RTUINT128U const uSrc2 = *puSrc2;
11214 RTUINT128U const uSrc1 = *puSrc1;
11215 ASMCompilerBarrier();
11216 RTUINT128U uDstOut;
11217 uDstOut.au32[0] = uSrc1.au32[2];
11218 uDstOut.au32[1] = uSrc2.au32[2];
11219 uDstOut.au32[2] = uSrc1.au32[3];
11220 uDstOut.au32[3] = uSrc2.au32[3];
11221 *puDst = uDstOut;
11222}
11223
11224
11225IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11226{
11227 RTUINT256U const uSrc2 = *puSrc2;
11228 RTUINT256U const uSrc1 = *puSrc1;
11229 ASMCompilerBarrier();
11230 RTUINT256U uDstOut;
11231 uDstOut.au32[0] = uSrc1.au32[2];
11232 uDstOut.au32[1] = uSrc2.au32[2];
11233 uDstOut.au32[2] = uSrc1.au32[3];
11234 uDstOut.au32[3] = uSrc2.au32[3];
11235
11236 uDstOut.au32[4] = uSrc1.au32[6];
11237 uDstOut.au32[5] = uSrc2.au32[6];
11238 uDstOut.au32[6] = uSrc1.au32[7];
11239 uDstOut.au32[7] = uSrc2.au32[7];
11240 *puDst = uDstOut;
11241}
11242
11243
11244/*
11245 * PUNPCKHQDQ -> High qwords -> double qword(s).
11246 */
11247#ifdef IEM_WITHOUT_ASSEMBLY
11248IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11249{
11250 RTUINT128U const uSrc2 = *puSrc;
11251 RTUINT128U const uSrc1 = *puDst;
11252 ASMCompilerBarrier();
11253 RTUINT128U uDstOut;
11254 uDstOut.au64[0] = uSrc1.au64[1];
11255 uDstOut.au64[1] = uSrc2.au64[1];
11256 *puDst = uDstOut;
11257}
11258#endif
11259
11260
11261IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11262{
11263 RTUINT128U const uSrc2 = *puSrc2;
11264 RTUINT128U const uSrc1 = *puSrc1;
11265 ASMCompilerBarrier();
11266 RTUINT128U uDstOut;
11267 uDstOut.au64[0] = uSrc1.au64[1];
11268 uDstOut.au64[1] = uSrc2.au64[1];
11269 *puDst = uDstOut;
11270}
11271
11272
11273IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11274{
11275 RTUINT256U const uSrc2 = *puSrc2;
11276 RTUINT256U const uSrc1 = *puSrc1;
11277 ASMCompilerBarrier();
11278 RTUINT256U uDstOut;
11279 uDstOut.au64[0] = uSrc1.au64[1];
11280 uDstOut.au64[1] = uSrc2.au64[1];
11281
11282 uDstOut.au64[2] = uSrc1.au64[3];
11283 uDstOut.au64[3] = uSrc2.au64[3];
11284 *puDst = uDstOut;
11285}
11286
11287
11288/*
11289 * PUNPCKLBW - low bytes -> words
11290 */
11291#ifdef IEM_WITHOUT_ASSEMBLY
11292
11293IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11294{
11295 RTUINT64U const uSrc2 = { *puSrc };
11296 RTUINT64U const uSrc1 = { *puDst };
11297 ASMCompilerBarrier();
11298 RTUINT64U uDstOut;
11299 uDstOut.au8[0] = uSrc1.au8[0];
11300 uDstOut.au8[1] = uSrc2.au8[0];
11301 uDstOut.au8[2] = uSrc1.au8[1];
11302 uDstOut.au8[3] = uSrc2.au8[1];
11303 uDstOut.au8[4] = uSrc1.au8[2];
11304 uDstOut.au8[5] = uSrc2.au8[2];
11305 uDstOut.au8[6] = uSrc1.au8[3];
11306 uDstOut.au8[7] = uSrc2.au8[3];
11307 *puDst = uDstOut.u;
11308}
11309
11310
11311IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11312{
11313 RTUINT128U const uSrc2 = *puSrc;
11314 RTUINT128U const uSrc1 = *puDst;
11315 ASMCompilerBarrier();
11316 RTUINT128U uDstOut;
11317 uDstOut.au8[ 0] = uSrc1.au8[0];
11318 uDstOut.au8[ 1] = uSrc2.au8[0];
11319 uDstOut.au8[ 2] = uSrc1.au8[1];
11320 uDstOut.au8[ 3] = uSrc2.au8[1];
11321 uDstOut.au8[ 4] = uSrc1.au8[2];
11322 uDstOut.au8[ 5] = uSrc2.au8[2];
11323 uDstOut.au8[ 6] = uSrc1.au8[3];
11324 uDstOut.au8[ 7] = uSrc2.au8[3];
11325 uDstOut.au8[ 8] = uSrc1.au8[4];
11326 uDstOut.au8[ 9] = uSrc2.au8[4];
11327 uDstOut.au8[10] = uSrc1.au8[5];
11328 uDstOut.au8[11] = uSrc2.au8[5];
11329 uDstOut.au8[12] = uSrc1.au8[6];
11330 uDstOut.au8[13] = uSrc2.au8[6];
11331 uDstOut.au8[14] = uSrc1.au8[7];
11332 uDstOut.au8[15] = uSrc2.au8[7];
11333 *puDst = uDstOut;
11334}
11335
11336#endif
11337
11338IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11339{
11340 RTUINT128U const uSrc2 = *puSrc2;
11341 RTUINT128U const uSrc1 = *puSrc1;
11342 ASMCompilerBarrier();
11343 RTUINT128U uDstOut;
11344 uDstOut.au8[ 0] = uSrc1.au8[0];
11345 uDstOut.au8[ 1] = uSrc2.au8[0];
11346 uDstOut.au8[ 2] = uSrc1.au8[1];
11347 uDstOut.au8[ 3] = uSrc2.au8[1];
11348 uDstOut.au8[ 4] = uSrc1.au8[2];
11349 uDstOut.au8[ 5] = uSrc2.au8[2];
11350 uDstOut.au8[ 6] = uSrc1.au8[3];
11351 uDstOut.au8[ 7] = uSrc2.au8[3];
11352 uDstOut.au8[ 8] = uSrc1.au8[4];
11353 uDstOut.au8[ 9] = uSrc2.au8[4];
11354 uDstOut.au8[10] = uSrc1.au8[5];
11355 uDstOut.au8[11] = uSrc2.au8[5];
11356 uDstOut.au8[12] = uSrc1.au8[6];
11357 uDstOut.au8[13] = uSrc2.au8[6];
11358 uDstOut.au8[14] = uSrc1.au8[7];
11359 uDstOut.au8[15] = uSrc2.au8[7];
11360 *puDst = uDstOut;
11361}
11362
11363
11364IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11365{
11366 RTUINT256U const uSrc2 = *puSrc2;
11367 RTUINT256U const uSrc1 = *puSrc1;
11368 ASMCompilerBarrier();
11369 RTUINT256U uDstOut;
11370 uDstOut.au8[ 0] = uSrc1.au8[0];
11371 uDstOut.au8[ 1] = uSrc2.au8[0];
11372 uDstOut.au8[ 2] = uSrc1.au8[1];
11373 uDstOut.au8[ 3] = uSrc2.au8[1];
11374 uDstOut.au8[ 4] = uSrc1.au8[2];
11375 uDstOut.au8[ 5] = uSrc2.au8[2];
11376 uDstOut.au8[ 6] = uSrc1.au8[3];
11377 uDstOut.au8[ 7] = uSrc2.au8[3];
11378 uDstOut.au8[ 8] = uSrc1.au8[4];
11379 uDstOut.au8[ 9] = uSrc2.au8[4];
11380 uDstOut.au8[10] = uSrc1.au8[5];
11381 uDstOut.au8[11] = uSrc2.au8[5];
11382 uDstOut.au8[12] = uSrc1.au8[6];
11383 uDstOut.au8[13] = uSrc2.au8[6];
11384 uDstOut.au8[14] = uSrc1.au8[7];
11385 uDstOut.au8[15] = uSrc2.au8[7];
11386 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11387 uDstOut.au8[16] = uSrc1.au8[16];
11388 uDstOut.au8[17] = uSrc2.au8[16];
11389 uDstOut.au8[18] = uSrc1.au8[17];
11390 uDstOut.au8[19] = uSrc2.au8[17];
11391 uDstOut.au8[20] = uSrc1.au8[18];
11392 uDstOut.au8[21] = uSrc2.au8[18];
11393 uDstOut.au8[22] = uSrc1.au8[19];
11394 uDstOut.au8[23] = uSrc2.au8[19];
11395 uDstOut.au8[24] = uSrc1.au8[20];
11396 uDstOut.au8[25] = uSrc2.au8[20];
11397 uDstOut.au8[26] = uSrc1.au8[21];
11398 uDstOut.au8[27] = uSrc2.au8[21];
11399 uDstOut.au8[28] = uSrc1.au8[22];
11400 uDstOut.au8[29] = uSrc2.au8[22];
11401 uDstOut.au8[30] = uSrc1.au8[23];
11402 uDstOut.au8[31] = uSrc2.au8[23];
11403 *puDst = uDstOut;
11404}
11405
11406
11407/*
11408 * PUNPCKLWD - low words -> dwords
11409 */
11410#ifdef IEM_WITHOUT_ASSEMBLY
11411
11412IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11413{
11414 RTUINT64U const uSrc2 = { *puSrc };
11415 RTUINT64U const uSrc1 = { *puDst };
11416 ASMCompilerBarrier();
11417 RTUINT64U uDstOut;
11418 uDstOut.au16[0] = uSrc1.au16[0];
11419 uDstOut.au16[1] = uSrc2.au16[0];
11420 uDstOut.au16[2] = uSrc1.au16[1];
11421 uDstOut.au16[3] = uSrc2.au16[1];
11422 *puDst = uDstOut.u;
11423}
11424
11425
11426IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11427{
11428 RTUINT128U const uSrc2 = *puSrc;
11429 RTUINT128U const uSrc1 = *puDst;
11430 ASMCompilerBarrier();
11431 RTUINT128U uDstOut;
11432 uDstOut.au16[0] = uSrc1.au16[0];
11433 uDstOut.au16[1] = uSrc2.au16[0];
11434 uDstOut.au16[2] = uSrc1.au16[1];
11435 uDstOut.au16[3] = uSrc2.au16[1];
11436 uDstOut.au16[4] = uSrc1.au16[2];
11437 uDstOut.au16[5] = uSrc2.au16[2];
11438 uDstOut.au16[6] = uSrc1.au16[3];
11439 uDstOut.au16[7] = uSrc2.au16[3];
11440 *puDst = uDstOut;
11441}
11442
11443#endif
11444
11445IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11446{
11447 RTUINT128U const uSrc2 = *puSrc2;
11448 RTUINT128U const uSrc1 = *puSrc1;
11449 ASMCompilerBarrier();
11450 RTUINT128U uDstOut;
11451 uDstOut.au16[0] = uSrc1.au16[0];
11452 uDstOut.au16[1] = uSrc2.au16[0];
11453 uDstOut.au16[2] = uSrc1.au16[1];
11454 uDstOut.au16[3] = uSrc2.au16[1];
11455 uDstOut.au16[4] = uSrc1.au16[2];
11456 uDstOut.au16[5] = uSrc2.au16[2];
11457 uDstOut.au16[6] = uSrc1.au16[3];
11458 uDstOut.au16[7] = uSrc2.au16[3];
11459 *puDst = uDstOut;
11460}
11461
11462
11463IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11464{
11465 RTUINT256U const uSrc2 = *puSrc2;
11466 RTUINT256U const uSrc1 = *puSrc1;
11467 ASMCompilerBarrier();
11468 RTUINT256U uDstOut;
11469 uDstOut.au16[0] = uSrc1.au16[0];
11470 uDstOut.au16[1] = uSrc2.au16[0];
11471 uDstOut.au16[2] = uSrc1.au16[1];
11472 uDstOut.au16[3] = uSrc2.au16[1];
11473 uDstOut.au16[4] = uSrc1.au16[2];
11474 uDstOut.au16[5] = uSrc2.au16[2];
11475 uDstOut.au16[6] = uSrc1.au16[3];
11476 uDstOut.au16[7] = uSrc2.au16[3];
11477
11478 uDstOut.au16[8] = uSrc1.au16[8];
11479 uDstOut.au16[9] = uSrc2.au16[8];
11480 uDstOut.au16[10] = uSrc1.au16[9];
11481 uDstOut.au16[11] = uSrc2.au16[9];
11482 uDstOut.au16[12] = uSrc1.au16[10];
11483 uDstOut.au16[13] = uSrc2.au16[10];
11484 uDstOut.au16[14] = uSrc1.au16[11];
11485 uDstOut.au16[15] = uSrc2.au16[11];
11486 *puDst = uDstOut;
11487}
11488
11489
11490/*
11491 * PUNPCKLDQ - low dwords -> qword(s)
11492 */
11493#ifdef IEM_WITHOUT_ASSEMBLY
11494
11495IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11496{
11497 RTUINT64U const uSrc2 = { *puSrc };
11498 RTUINT64U const uSrc1 = { *puDst };
11499 ASMCompilerBarrier();
11500 RTUINT64U uDstOut;
11501 uDstOut.au32[0] = uSrc1.au32[0];
11502 uDstOut.au32[1] = uSrc2.au32[0];
11503 *puDst = uDstOut.u;
11504}
11505
11506
11507IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11508{
11509 RTUINT128U const uSrc2 = *puSrc;
11510 RTUINT128U const uSrc1 = *puDst;
11511 ASMCompilerBarrier();
11512 RTUINT128U uDstOut;
11513 uDstOut.au32[0] = uSrc1.au32[0];
11514 uDstOut.au32[1] = uSrc2.au32[0];
11515 uDstOut.au32[2] = uSrc1.au32[1];
11516 uDstOut.au32[3] = uSrc2.au32[1];
11517 *puDst = uDstOut;
11518}
11519
11520#endif
11521
11522IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11523{
11524 RTUINT128U const uSrc2 = *puSrc2;
11525 RTUINT128U const uSrc1 = *puSrc1;
11526 ASMCompilerBarrier();
11527 RTUINT128U uDstOut;
11528 uDstOut.au32[0] = uSrc1.au32[0];
11529 uDstOut.au32[1] = uSrc2.au32[0];
11530 uDstOut.au32[2] = uSrc1.au32[1];
11531 uDstOut.au32[3] = uSrc2.au32[1];
11532 *puDst = uDstOut;
11533}
11534
11535
11536IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11537{
11538 RTUINT256U const uSrc2 = *puSrc2;
11539 RTUINT256U const uSrc1 = *puSrc1;
11540 ASMCompilerBarrier();
11541 RTUINT256U uDstOut;
11542 uDstOut.au32[0] = uSrc1.au32[0];
11543 uDstOut.au32[1] = uSrc2.au32[0];
11544 uDstOut.au32[2] = uSrc1.au32[1];
11545 uDstOut.au32[3] = uSrc2.au32[1];
11546
11547 uDstOut.au32[4] = uSrc1.au32[4];
11548 uDstOut.au32[5] = uSrc2.au32[4];
11549 uDstOut.au32[6] = uSrc1.au32[5];
11550 uDstOut.au32[7] = uSrc2.au32[5];
11551 *puDst = uDstOut;
11552}
11553
11554
11555/*
11556 * PUNPCKLQDQ -> Low qwords -> double qword(s).
11557 */
11558#ifdef IEM_WITHOUT_ASSEMBLY
11559IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11560{
11561 RTUINT128U const uSrc2 = *puSrc;
11562 RTUINT128U const uSrc1 = *puDst;
11563 ASMCompilerBarrier();
11564 RTUINT128U uDstOut;
11565 uDstOut.au64[0] = uSrc1.au64[0];
11566 uDstOut.au64[1] = uSrc2.au64[0];
11567 *puDst = uDstOut;
11568}
11569#endif
11570
11571
11572IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11573{
11574 RTUINT128U const uSrc2 = *puSrc2;
11575 RTUINT128U const uSrc1 = *puSrc1;
11576 ASMCompilerBarrier();
11577 RTUINT128U uDstOut;
11578 uDstOut.au64[0] = uSrc1.au64[0];
11579 uDstOut.au64[1] = uSrc2.au64[0];
11580 *puDst = uDstOut;
11581}
11582
11583
11584IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11585{
11586 RTUINT256U const uSrc2 = *puSrc2;
11587 RTUINT256U const uSrc1 = *puSrc1;
11588 ASMCompilerBarrier();
11589 RTUINT256U uDstOut;
11590 uDstOut.au64[0] = uSrc1.au64[0];
11591 uDstOut.au64[1] = uSrc2.au64[0];
11592
11593 uDstOut.au64[2] = uSrc1.au64[2];
11594 uDstOut.au64[3] = uSrc2.au64[2];
11595 *puDst = uDstOut;
11596}
11597
11598
11599/*
11600 * PACKSSWB - signed words -> signed bytes
11601 */
11602
11603#ifdef IEM_WITHOUT_ASSEMBLY
11604
11605IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11606{
11607 RTUINT64U const uSrc2 = { *puSrc };
11608 RTUINT64U const uSrc1 = { *puDst };
11609 ASMCompilerBarrier();
11610 RTUINT64U uDstOut;
11611 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11612 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11613 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11614 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11615 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11616 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11617 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11618 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11619 *puDst = uDstOut.u;
11620}
11621
11622
11623IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11624{
11625 RTUINT128U const uSrc2 = *puSrc;
11626 RTUINT128U const uSrc1 = *puDst;
11627 ASMCompilerBarrier();
11628 RTUINT128U uDstOut;
11629 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11630 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11631 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11632 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11633 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
11634 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
11635 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
11636 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
11637 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11638 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11639 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11640 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11641 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
11642 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
11643 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
11644 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
11645 *puDst = uDstOut;
11646}
11647
11648#endif
11649
11650IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11651{
11652 RTUINT128U const uSrc2 = *puSrc2;
11653 RTUINT128U const uSrc1 = *puSrc1;
11654 ASMCompilerBarrier();
11655 RTUINT128U uDstOut;
11656 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11657 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11658 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11659 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11660 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
11661 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
11662 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
11663 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
11664 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11665 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11666 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11667 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11668 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
11669 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
11670 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
11671 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
11672 *puDst = uDstOut;
11673}
11674
11675
11676IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11677{
11678 RTUINT256U const uSrc2 = *puSrc2;
11679 RTUINT256U const uSrc1 = *puSrc1;
11680 ASMCompilerBarrier();
11681 RTUINT256U uDstOut;
11682 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11683 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11684 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11685 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11686 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
11687 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
11688 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
11689 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
11690 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11691 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11692 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11693 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11694 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
11695 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
11696 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
11697 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
11698
11699 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
11700 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
11701 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
11702 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
11703 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
11704 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
11705 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
11706 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
11707 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
11708 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
11709 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
11710 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
11711 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
11712 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
11713 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
11714 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
11715 *puDst = uDstOut;
11716}
11717
11718
/*
 * PACKUSWB - signed words -> unsigned bytes
 */
/** Clamps a 16-bit value, read as signed, to the unsigned byte range: values
 *  0..0xff pass through, anything else becomes 0x00 when bit 15 (the sign) is
 *  set and 0xff when it is clear.  The argument is evaluated more than once. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    (  (uint16_t)(a_iWord) <= (uint16_t)0xff \
     ? (uint8_t)(a_iWord) \
     : ((((a_iWord) >> 15) & 1) ? (uint8_t)0x00 : (uint8_t)0xff) )
11726
11727#ifdef IEM_WITHOUT_ASSEMBLY
11728
11729IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11730{
11731 RTUINT64U const uSrc2 = { *puSrc };
11732 RTUINT64U const uSrc1 = { *puDst };
11733 ASMCompilerBarrier();
11734 RTUINT64U uDstOut;
11735 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11736 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11737 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11738 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11739 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11740 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11741 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11742 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11743 *puDst = uDstOut.u;
11744}
11745
11746
11747IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11748{
11749 RTUINT128U const uSrc2 = *puSrc;
11750 RTUINT128U const uSrc1 = *puDst;
11751 ASMCompilerBarrier();
11752 RTUINT128U uDstOut;
11753 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11754 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11755 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11756 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11757 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
11758 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
11759 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
11760 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
11761 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11762 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11763 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11764 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11765 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
11766 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
11767 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
11768 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
11769 *puDst = uDstOut;
11770}
11771
11772#endif
11773
11774IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11775{
11776 RTUINT128U const uSrc2 = *puSrc2;
11777 RTUINT128U const uSrc1 = *puSrc1;
11778 ASMCompilerBarrier();
11779 RTUINT128U uDstOut;
11780 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11781 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11782 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11783 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11784 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
11785 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
11786 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
11787 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
11788 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11789 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11790 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11791 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11792 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
11793 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
11794 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
11795 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
11796 *puDst = uDstOut;
11797}
11798
11799
11800IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11801{
11802 RTUINT256U const uSrc2 = *puSrc2;
11803 RTUINT256U const uSrc1 = *puSrc1;
11804 ASMCompilerBarrier();
11805 RTUINT256U uDstOut;
11806 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11807 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11808 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11809 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11810 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
11811 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
11812 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
11813 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
11814 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11815 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11816 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11817 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11818 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
11819 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
11820 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
11821 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
11822
11823 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
11824 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
11825 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
11826 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
11827 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
11828 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
11829 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
11830 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
11831 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
11832 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
11833 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
11834 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
11835 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
11836 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
11837 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
11838 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
11839 *puDst = uDstOut;
11840}
11841
11842
11843/*
11844 * PACKSSDW - signed dwords -> signed words
11845 */
11846
11847#ifdef IEM_WITHOUT_ASSEMBLY
11848
11849IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11850{
11851 RTUINT64U const uSrc2 = { *puSrc };
11852 RTUINT64U const uSrc1 = { *puDst };
11853 ASMCompilerBarrier();
11854 RTUINT64U uDstOut;
11855 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11856 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11857 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11858 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11859 *puDst = uDstOut.u;
11860}
11861
11862
11863IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11864{
11865 RTUINT128U const uSrc2 = *puSrc;
11866 RTUINT128U const uSrc1 = *puDst;
11867 ASMCompilerBarrier();
11868 RTUINT128U uDstOut;
11869 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11870 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11871 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
11872 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
11873 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11874 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11875 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
11876 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
11877 *puDst = uDstOut;
11878}
11879
11880#endif
11881
11882IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11883{
11884 RTUINT128U const uSrc2 = *puSrc2;
11885 RTUINT128U const uSrc1 = *puSrc1;
11886 ASMCompilerBarrier();
11887 RTUINT128U uDstOut;
11888 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11889 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11890 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
11891 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
11892 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11893 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11894 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
11895 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
11896 *puDst = uDstOut;
11897}
11898
11899
11900IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11901{
11902 RTUINT256U const uSrc2 = *puSrc2;
11903 RTUINT256U const uSrc1 = *puSrc1;
11904 ASMCompilerBarrier();
11905 RTUINT256U uDstOut;
11906 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11907 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11908 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
11909 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
11910 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11911 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11912 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
11913 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
11914
11915 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
11916 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
11917 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
11918 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
11919 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
11920 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
11921 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
11922 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
11923 *puDst = uDstOut;
11924}
11925
11926
/*
 * PACKUSDW - signed dwords -> unsigned words
 */
/** Clamps a 32-bit value, read as signed, to the unsigned word range: values
 *  0..0xffff pass through, anything else becomes 0x0000 when bit 31 (the sign)
 *  is set and 0xffff when it is clear.  The argument is evaluated more than
 *  once. */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    (  (uint32_t)(a_iDword) <= (uint16_t)0xffff \
     ? (uint16_t)(a_iDword) \
     : ((((a_iDword) >> 31) & 1) ? (uint16_t)0x0000 : (uint16_t)0xffff) )
11934
11935#ifdef IEM_WITHOUT_ASSEMBLY
11936IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11937{
11938 RTUINT128U const uSrc2 = *puSrc;
11939 RTUINT128U const uSrc1 = *puDst;
11940 ASMCompilerBarrier();
11941 RTUINT128U uDstOut;
11942 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
11943 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
11944 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
11945 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
11946 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
11947 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
11948 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
11949 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
11950 *puDst = uDstOut;
11951}
11952#endif
11953
11954IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11955{
11956 RTUINT128U const uSrc2 = *puSrc2;
11957 RTUINT128U const uSrc1 = *puSrc1;
11958 ASMCompilerBarrier();
11959 RTUINT128U uDstOut;
11960 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
11961 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
11962 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
11963 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
11964 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
11965 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
11966 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
11967 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
11968 *puDst = uDstOut;
11969}
11970
11971
11972IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11973{
11974 RTUINT256U const uSrc2 = *puSrc2;
11975 RTUINT256U const uSrc1 = *puSrc1;
11976 ASMCompilerBarrier();
11977 RTUINT256U uDstOut;
11978 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
11979 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
11980 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
11981 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
11982 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
11983 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
11984 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
11985 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
11986
11987 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
11988 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
11989 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
11990 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
11991 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
11992 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
11993 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
11994 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
11995 *puDst = uDstOut;
11996}
11997
11998
11999/*
12000 * [V]PABSB / [V]PABSW / [V]PABSD
12001 */
12002
12003IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12004{
12005 RTUINT64U const uSrc = { *puSrc };
12006 RTUINT64U uDstOut = { 0 };
12007
12008 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12009 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12010 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12011 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12012 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12013 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12014 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12015 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12016 *puDst = uDstOut.u;
12017 RT_NOREF(pFpuState);
12018}
12019
12020
12021IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12022{
12023 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12024 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12025 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12026 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12027 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12028 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12029 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12030 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12031 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12032 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12033 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12034 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12035 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12036 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12037 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12038 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12039 RT_NOREF(pFpuState);
12040}
12041
12042
12043IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12044{
12045 RTUINT64U const uSrc = { *puSrc };
12046 RTUINT64U uDstOut = { 0 };
12047
12048 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
12049 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
12050 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
12051 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
12052 *puDst = uDstOut.u;
12053 RT_NOREF(pFpuState);
12054}
12055
12056
12057IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12058{
12059 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12060 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12061 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12062 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12063 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12064 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12065 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12066 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12067 RT_NOREF(pFpuState);
12068}
12069
12070
12071IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12072{
12073 RTUINT64U const uSrc = { *puSrc };
12074 RTUINT64U uDstOut = { 0 };
12075
12076 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
12077 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
12078 *puDst = uDstOut.u;
12079 RT_NOREF(pFpuState);
12080}
12081
12082
12083IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12084{
12085 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12086 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12087 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12088 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12089 RT_NOREF(pFpuState);
12090}
12091
12092
12093IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12094{
12095 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12096 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12097 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12098 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12099 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12100 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12101 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12102 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12103 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12104 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12105 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12106 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12107 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12108 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12109 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12110 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12111}
12112
12113
12114IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12115{
12116 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12117 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12118 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12119 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12120 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12121 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12122 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12123 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12124 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12125 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12126 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12127 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12128 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12129 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12130 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12131 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12132 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
12133 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
12134 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
12135 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
12136 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
12137 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
12138 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
12139 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
12140 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
12141 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
12142 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
12143 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
12144 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
12145 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
12146 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
12147 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
12148}
12149
12150
12151IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12152{
12153 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12154 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12155 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12156 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12157 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12158 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12159 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12160 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12161}
12162
12163
12164IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12165{
12166 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12167 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12168 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12169 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12170 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12171 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12172 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12173 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12174 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
12175 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
12176 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
12177 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
12178 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
12179 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
12180 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
12181 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
12182}
12183
12184
12185IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12186{
12187 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12188 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12189 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12190 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12191}
12192
12193
12194IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12195{
12196 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12197 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12198 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12199 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12200 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
12201 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
12202 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
12203 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
12204}
12205
12206
12207/*
12208 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
12209 */
12210IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12211{
12212 RTUINT64U uSrc1 = { *puDst };
12213 RTUINT64U uSrc2 = { *puSrc };
12214 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12215
12216 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
12217 {
12218 if (uSrc2.ai8[i] < 0)
12219 uDst.ai8[i] = -uSrc1.ai8[i];
12220 else if (uSrc2.ai8[i] == 0)
12221 uDst.ai8[i] = 0;
12222 else /* uSrc2.ai8[i] > 0 */
12223 uDst.ai8[i] = uSrc1.ai8[i];
12224 }
12225
12226 *puDst = uDst.u;
12227 RT_NOREF(pFpuState);
12228}
12229
12230
12231IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12232{
12233 RTUINT128U uSrc1 = *puDst;
12234
12235 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12236 {
12237 if (puSrc->ai8[i] < 0)
12238 puDst->ai8[i] = -uSrc1.ai8[i];
12239 else if (puSrc->ai8[i] == 0)
12240 puDst->ai8[i] = 0;
12241 else /* puSrc->ai8[i] > 0 */
12242 puDst->ai8[i] = uSrc1.ai8[i];
12243 }
12244
12245 RT_NOREF(pFpuState);
12246}
12247
12248
12249IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12250{
12251 RTUINT64U uSrc1 = { *puDst };
12252 RTUINT64U uSrc2 = { *puSrc };
12253 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12254
12255 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
12256 {
12257 if (uSrc2.ai16[i] < 0)
12258 uDst.ai16[i] = -uSrc1.ai16[i];
12259 else if (uSrc2.ai16[i] == 0)
12260 uDst.ai16[i] = 0;
12261 else /* uSrc2.ai16[i] > 0 */
12262 uDst.ai16[i] = uSrc1.ai16[i];
12263 }
12264
12265 *puDst = uDst.u;
12266 RT_NOREF(pFpuState);
12267}
12268
12269
12270IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12271{
12272 RTUINT128U uSrc1 = *puDst;
12273
12274 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12275 {
12276 if (puSrc->ai16[i] < 0)
12277 puDst->ai16[i] = -uSrc1.ai16[i];
12278 else if (puSrc->ai16[i] == 0)
12279 puDst->ai16[i] = 0;
12280 else /* puSrc->ai16[i] > 0 */
12281 puDst->ai16[i] = uSrc1.ai16[i];
12282 }
12283
12284 RT_NOREF(pFpuState);
12285}
12286
12287
12288IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12289{
12290 RTUINT64U uSrc1 = { *puDst };
12291 RTUINT64U uSrc2 = { *puSrc };
12292 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12293
12294 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
12295 {
12296 if (uSrc2.ai32[i] < 0)
12297 uDst.ai32[i] = -uSrc1.ai32[i];
12298 else if (uSrc2.ai32[i] == 0)
12299 uDst.ai32[i] = 0;
12300 else /* uSrc2.ai32[i] > 0 */
12301 uDst.ai32[i] = uSrc1.ai32[i];
12302 }
12303
12304 *puDst = uDst.u;
12305 RT_NOREF(pFpuState);
12306}
12307
12308
12309IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12310{
12311 RTUINT128U uSrc1 = *puDst;
12312
12313 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12314 {
12315 if (puSrc->ai32[i] < 0)
12316 puDst->ai32[i] = -uSrc1.ai32[i];
12317 else if (puSrc->ai32[i] == 0)
12318 puDst->ai32[i] = 0;
12319 else /* puSrc->ai32[i] > 0 */
12320 puDst->ai32[i] = uSrc1.ai32[i];
12321 }
12322
12323 RT_NOREF(pFpuState);
12324}
12325
12326
12327IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12328{
12329 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12330 {
12331 if (puSrc2->ai8[i] < 0)
12332 puDst->ai8[i] = -puSrc1->ai8[i];
12333 else if (puSrc2->ai8[i] == 0)
12334 puDst->ai8[i] = 0;
12335 else /* puSrc2->ai8[i] > 0 */
12336 puDst->ai8[i] = puSrc1->ai8[i];
12337 }
12338}
12339
12340
12341IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12342{
12343 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12344 {
12345 if (puSrc2->ai8[i] < 0)
12346 puDst->ai8[i] = -puSrc1->ai8[i];
12347 else if (puSrc2->ai8[i] == 0)
12348 puDst->ai8[i] = 0;
12349 else /* puSrc2->ai8[i] > 0 */
12350 puDst->ai8[i] = puSrc1->ai8[i];
12351 }
12352}
12353
12354
12355IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12356{
12357 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12358 {
12359 if (puSrc2->ai16[i] < 0)
12360 puDst->ai16[i] = -puSrc1->ai16[i];
12361 else if (puSrc2->ai16[i] == 0)
12362 puDst->ai16[i] = 0;
12363 else /* puSrc2->ai16[i] > 0 */
12364 puDst->ai16[i] = puSrc1->ai16[i];
12365 }
12366}
12367
12368
12369IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12370{
12371 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12372 {
12373 if (puSrc2->ai16[i] < 0)
12374 puDst->ai16[i] = -puSrc1->ai16[i];
12375 else if (puSrc2->ai16[i] == 0)
12376 puDst->ai16[i] = 0;
12377 else /* puSrc2->ai16[i] > 0 */
12378 puDst->ai16[i] = puSrc1->ai16[i];
12379 }
12380}
12381
12382
12383IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12384{
12385 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12386 {
12387 if (puSrc2->ai32[i] < 0)
12388 puDst->ai32[i] = -puSrc1->ai32[i];
12389 else if (puSrc2->ai32[i] == 0)
12390 puDst->ai32[i] = 0;
12391 else /* puSrc2->ai32[i] > 0 */
12392 puDst->ai32[i] = puSrc1->ai32[i];
12393 }
12394}
12395
12396
12397IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12398{
12399 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12400 {
12401 if (puSrc2->ai32[i] < 0)
12402 puDst->ai32[i] = -puSrc1->ai32[i];
12403 else if (puSrc2->ai32[i] == 0)
12404 puDst->ai32[i] = 0;
12405 else /* puSrc2->ai32[i] > 0 */
12406 puDst->ai32[i] = puSrc1->ai32[i];
12407 }
12408}
12409
12410
12411/*
12412 * PHADDW / VPHADDW / PHADDD / VPHADDD
12413 */
12414IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12415{
12416 RTUINT64U uSrc1 = { *puDst };
12417 RTUINT64U uSrc2 = { *puSrc };
12418 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12419
12420 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
12421 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
12422 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
12423 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
12424 *puDst = uDst.u;
12425 RT_NOREF(pFpuState);
12426}
12427
12428
12429IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12430{
12431 RTUINT128U uSrc1 = *puDst;
12432
12433 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
12434 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
12435 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
12436 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
12437
12438 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
12439 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
12440 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
12441 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
12442 RT_NOREF(pFpuState);
12443}
12444
12445
12446IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12447{
12448 RTUINT64U uSrc1 = { *puDst };
12449 RTUINT64U uSrc2 = { *puSrc };
12450 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12451
12452 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
12453 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
12454 *puDst = uDst.u;
12455 RT_NOREF(pFpuState);
12456}
12457
12458
12459IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12460{
12461 RTUINT128U uSrc1 = *puDst;
12462
12463 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
12464 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
12465
12466 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
12467 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
12468 RT_NOREF(pFpuState);
12469}
12470
12471
12472IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12473{
12474 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12475
12476 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
12477 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
12478 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
12479 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
12480
12481 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
12482 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
12483 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
12484 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
12485
12486 puDst->au64[0] = uDst.au64[0];
12487 puDst->au64[1] = uDst.au64[1];
12488}
12489
12490
12491IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12492{
12493 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12494
12495 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
12496 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
12497 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
12498 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
12499 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
12500 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
12501 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
12502 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
12503
12504 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
12505 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
12506 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
12507 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
12508 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
12509 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
12510 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
12511 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
12512
12513 puDst->au64[0] = uDst.au64[0];
12514 puDst->au64[1] = uDst.au64[1];
12515 puDst->au64[2] = uDst.au64[2];
12516 puDst->au64[3] = uDst.au64[3];
12517}
12518
12519
12520IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12521{
12522 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12523
12524 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
12525 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
12526
12527 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
12528 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
12529
12530 puDst->au64[0] = uDst.au64[0];
12531 puDst->au64[1] = uDst.au64[1];
12532}
12533
12534
12535IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12536{
12537 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12538
12539 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
12540 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
12541 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
12542 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
12543
12544 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
12545 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
12546 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
12547 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
12548
12549 puDst->au64[0] = uDst.au64[0];
12550 puDst->au64[1] = uDst.au64[1];
12551 puDst->au64[2] = uDst.au64[2];
12552 puDst->au64[3] = uDst.au64[3];
12553}
12554
12555
12556/*
12557 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
12558 */
12559IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12560{
12561 RTUINT64U uSrc1 = { *puDst };
12562 RTUINT64U uSrc2 = { *puSrc };
12563 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12564
12565 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
12566 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
12567 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
12568 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
12569 *puDst = uDst.u;
12570 RT_NOREF(pFpuState);
12571}
12572
12573
12574IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12575{
12576 RTUINT128U uSrc1 = *puDst;
12577
12578 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
12579 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
12580 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
12581 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
12582
12583 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
12584 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
12585 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
12586 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
12587 RT_NOREF(pFpuState);
12588}
12589
12590
12591IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12592{
12593 RTUINT64U uSrc1 = { *puDst };
12594 RTUINT64U uSrc2 = { *puSrc };
12595 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12596
12597 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
12598 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
12599 *puDst = uDst.u;
12600 RT_NOREF(pFpuState);
12601}
12602
12603
12604IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12605{
12606 RTUINT128U uSrc1 = *puDst;
12607
12608 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
12609 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
12610
12611 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
12612 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
12613 RT_NOREF(pFpuState);
12614}
12615
12616
12617IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12618{
12619 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12620
12621 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
12622 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
12623 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
12624 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
12625
12626 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
12627 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
12628 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
12629 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
12630
12631 puDst->au64[0] = uDst.au64[0];
12632 puDst->au64[1] = uDst.au64[1];
12633}
12634
12635
12636IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12637{
12638 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12639
12640 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
12641 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
12642 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
12643 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
12644 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
12645 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
12646 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
12647 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
12648
12649 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
12650 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
12651 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
12652 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
12653 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
12654 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
12655 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
12656 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
12657
12658 puDst->au64[0] = uDst.au64[0];
12659 puDst->au64[1] = uDst.au64[1];
12660 puDst->au64[2] = uDst.au64[2];
12661 puDst->au64[3] = uDst.au64[3];
12662}
12663
12664
12665IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12666{
12667 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12668
12669 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
12670 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
12671
12672 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
12673 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
12674
12675 puDst->au64[0] = uDst.au64[0];
12676 puDst->au64[1] = uDst.au64[1];
12677}
12678
12679
12680IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12681{
12682 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12683
12684 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
12685 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
12686 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
12687 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
12688
12689 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
12690 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
12691 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
12692 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
12693
12694 puDst->au64[0] = uDst.au64[0];
12695 puDst->au64[1] = uDst.au64[1];
12696 puDst->au64[2] = uDst.au64[2];
12697 puDst->au64[3] = uDst.au64[3];
12698}
12699
12700
12701/*
12702 * PHADDSW / VPHADDSW
12703 */
12704IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12705{
12706 RTUINT64U uSrc1 = { *puDst };
12707 RTUINT64U uSrc2 = { *puSrc };
12708 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12709
12710 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
12711 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
12712 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
12713 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
12714 *puDst = uDst.u;
12715 RT_NOREF(pFpuState);
12716}
12717
12718
12719IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12720{
12721 RTUINT128U uSrc1 = *puDst;
12722
12723 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
12724 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
12725 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
12726 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
12727
12728 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
12729 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
12730 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
12731 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
12732 RT_NOREF(pFpuState);
12733}
12734
12735
12736IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12737{
12738 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12739
12740 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
12741 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
12742 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
12743 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
12744
12745 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
12746 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
12747 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
12748 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
12749
12750 puDst->au64[0] = uDst.au64[0];
12751 puDst->au64[1] = uDst.au64[1];
12752}
12753
12754
12755IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12756{
12757 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12758
12759 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
12760 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
12761 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
12762 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
12763 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
12764 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
12765 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
12766 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
12767
12768 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
12769 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
12770 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
12771 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
12772 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
12773 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
12774 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
12775 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
12776
12777 puDst->au64[0] = uDst.au64[0];
12778 puDst->au64[1] = uDst.au64[1];
12779 puDst->au64[2] = uDst.au64[2];
12780 puDst->au64[3] = uDst.au64[3];
12781}
12782
12783
12784/*
12785 * PHSUBSW / VPHSUBSW
12786 */
12787IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12788{
12789 RTUINT64U uSrc1 = { *puDst };
12790 RTUINT64U uSrc2 = { *puSrc };
12791 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12792
12793 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
12794 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
12795 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
12796 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
12797 *puDst = uDst.u;
12798 RT_NOREF(pFpuState);
12799}
12800
12801
12802IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12803{
12804 RTUINT128U uSrc1 = *puDst;
12805
12806 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
12807 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
12808 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
12809 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
12810
12811 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
12812 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
12813 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
12814 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
12815 RT_NOREF(pFpuState);
12816}
12817
12818
12819IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12820{
12821 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12822
12823 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
12824 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
12825 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
12826 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
12827
12828 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
12829 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
12830 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
12831 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
12832
12833 puDst->au64[0] = uDst.au64[0];
12834 puDst->au64[1] = uDst.au64[1];
12835}
12836
12837
12838IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12839{
12840 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12841
12842 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
12843 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
12844 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
12845 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
12846 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
12847 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
12848 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
12849 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
12850
12851 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
12852 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
12853 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
12854 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
12855 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
12856 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
12857 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
12858 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
12859
12860 puDst->au64[0] = uDst.au64[0];
12861 puDst->au64[1] = uDst.au64[1];
12862 puDst->au64[2] = uDst.au64[2];
12863 puDst->au64[3] = uDst.au64[3];
12864}
12865
12866
12867/*
12868 * PMADDUBSW / VPMADDUBSW
12869 */
12870IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12871{
12872 RTUINT64U uSrc1 = { *puDst };
12873 RTUINT64U uSrc2 = { *puSrc };
12874 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12875
12876 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
12877 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
12878 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
12879 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
12880 *puDst = uDst.u;
12881 RT_NOREF(pFpuState);
12882}
12883
12884
12885IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12886{
12887 RTUINT128U uSrc1 = *puDst;
12888
12889 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
12890 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
12891 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
12892 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
12893 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
12894 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
12895 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
12896 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
12897 RT_NOREF(pFpuState);
12898}
12899
12900
12901IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12902{
12903 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12904
12905 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
12906 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
12907 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
12908 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
12909 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
12910 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
12911 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
12912 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
12913
12914 puDst->au64[0] = uDst.au64[0];
12915 puDst->au64[1] = uDst.au64[1];
12916}
12917
12918
12919IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12920{
12921 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12922
12923 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
12924 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
12925 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
12926 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
12927 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
12928 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
12929 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
12930 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
12931 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
12932 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
12933 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
12934 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
12935 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
12936 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
12937 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
12938 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
12939
12940 puDst->au64[0] = uDst.au64[0];
12941 puDst->au64[1] = uDst.au64[1];
12942 puDst->au64[2] = uDst.au64[2];
12943 puDst->au64[3] = uDst.au64[3];
12944}
12945
12946
12947/*
12948 * PMULHRSW / VPMULHRSW
12949 */
/** One PMULHRSW element: widen the first operand to 32 bits, multiply, drop
 *  the low 14 bits of the product, add one for rounding, drop one more bit
 *  and truncate to 16 bits.  Expects signed 16-bit operands. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ( (uint16_t)( ( ( ( (int32_t)(a_Src1) * (a_Src2) ) >> 14 ) + 1 ) >> 1 ) )
12952
12953IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12954{
12955 RTUINT64U uSrc1 = { *puDst };
12956 RTUINT64U uSrc2 = { *puSrc };
12957 RTUINT64U uDst;
12958
12959 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
12960 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
12961 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
12962 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
12963 *puDst = uDst.u;
12964 RT_NOREF(pFpuState);
12965}
12966
12967
12968IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12969{
12970 RTUINT128U uSrc1 = *puDst;
12971
12972 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
12973 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
12974 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
12975 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
12976 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
12977 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
12978 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
12979 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
12980 RT_NOREF(pFpuState);
12981}
12982
12983
12984IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12985{
12986 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12987
12988 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
12989 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
12990 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
12991 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
12992 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
12993 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
12994 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
12995 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
12996
12997 puDst->au64[0] = uDst.au64[0];
12998 puDst->au64[1] = uDst.au64[1];
12999}
13000
13001
13002IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13003{
13004 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13005
13006 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13007 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13008 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13009 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13010 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13011 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13012 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13013 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13014 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13015 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13016 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13017 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13018 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13019 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
13020 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
13021 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
13022
13023 puDst->au64[0] = uDst.au64[0];
13024 puDst->au64[1] = uDst.au64[1];
13025 puDst->au64[2] = uDst.au64[2];
13026 puDst->au64[3] = uDst.au64[3];
13027}
13028
13029
13030/*
13031 * PSADBW / VPSADBW
13032 */
13033#ifdef IEM_WITHOUT_ASSEMBLY
13034
13035IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13036{
13037 RTUINT64U uSrc1 = { *puDst };
13038 RTUINT64U uSrc2 = { *puSrc };
13039 RTUINT64U uDst;
13040 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13041 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13042 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13043 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13044 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13045 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13046 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13047 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13048
13049 uDst.au64[0] = 0;
13050 uDst.au16[0] = uSum;
13051 *puDst = uDst.u;
13052}
13053
13054
13055IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13056{
13057 RTUINT128U uSrc1 = *puDst;
13058
13059 puDst->au64[0] = 0;
13060 puDst->au64[1] = 0;
13061
13062 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
13063 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
13064 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
13065 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
13066 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
13067 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
13068 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
13069 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
13070 puDst->au16[0] = uSum;
13071
13072 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
13073 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
13074 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
13075 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
13076 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
13077 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
13078 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
13079 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
13080 puDst->au16[4] = uSum;
13081}
13082
13083#endif
13084
13085IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13086{
13087 RTUINT128U uSrc1 = *puSrc1;
13088 RTUINT128U uSrc2 = *puSrc2;
13089
13090 puDst->au64[0] = 0;
13091 puDst->au64[1] = 0;
13092
13093 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
13094 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13095 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13096 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13097 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13098 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13099 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13100 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13101 puDst->au16[0] = uSum;
13102
13103 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13104 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13105 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13106 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13107 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13108 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13109 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13110 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13111 puDst->au16[4] = uSum;
13112}
13113
13114IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13115{
13116 RTUINT256U uSrc1 = *puSrc1;
13117 RTUINT256U uSrc2 = *puSrc2;
13118
13119 puDst->au64[0] = 0;
13120 puDst->au64[1] = 0;
13121 puDst->au64[2] = 0;
13122 puDst->au64[3] = 0;
13123
13124 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13125 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13126 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13127 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13128 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13129 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13130 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13131 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13132 puDst->au16[0] = uSum;
13133
13134 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13135 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13136 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13137 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13138 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13139 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13140 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13141 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13142 puDst->au16[4] = uSum;
13143
13144 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
13145 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
13146 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
13147 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
13148 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
13149 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
13150 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
13151 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
13152 puDst->au16[8] = uSum;
13153
13154 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
13155 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
13156 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
13157 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
13158 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
13159 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
13160 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
13161 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
13162 puDst->au16[12] = uSum;
13163}
13164
13165
13166/*
13167 * PMULDQ / VPMULDQ
13168 */
13169IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13170{
13171 RTUINT128U uSrc1 = *puDst;
13172
13173 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
13174 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
13175}
13176
13177IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13178{
13179 RTUINT128U uSrc1 = *puSrc1;
13180 RTUINT128U uSrc2 = *puSrc2;
13181
13182 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13183 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13184}
13185
13186IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13187{
13188 RTUINT256U uSrc1 = *puSrc1;
13189 RTUINT256U uSrc2 = *puSrc2;
13190
13191 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13192 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13193 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
13194 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
13195}
13196
13197
13198/*
13199 * PMULUDQ / VPMULUDQ
13200 */
13201#ifdef IEM_WITHOUT_ASSEMBLY
13202
13203IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13204{
13205 RTUINT64U uSrc1 = { *puDst };
13206 RTUINT64U uSrc2 = { *puSrc };
13207 ASMCompilerBarrier();
13208 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13209 RT_NOREF(pFpuState);
13210}
13211
13212
13213IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13214{
13215 RTUINT128U uSrc1 = *puDst;
13216 RTUINT128U uSrc2 = *puSrc;
13217 ASMCompilerBarrier();
13218 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13219 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13220 RT_NOREF(pFpuState);
13221}
13222
13223#endif
13224
13225IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13226{
13227 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13228 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13229 ASMCompilerBarrier();
13230 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13231 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13232}
13233
13234
13235IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13236{
13237 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13238 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13239 ASMCompilerBarrier();
13240 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13241 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13242 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
13243 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
13244}
13245
13246
13247/*
13248 * UNPCKLPS / VUNPCKLPS
13249 */
13250#ifdef IEM_WITHOUT_ASSEMBLY
13251IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13252{
13253 RTUINT128U uSrc1 = *puDst;
13254 RTUINT128U uSrc2 = *puSrc;
13255 ASMCompilerBarrier();
13256 puDst->au32[0] = uSrc1.au32[0];
13257 puDst->au32[1] = uSrc2.au32[0];
13258 puDst->au32[2] = uSrc1.au32[1];
13259 puDst->au32[3] = uSrc2.au32[1];
13260}
13261
13262#endif
13263
13264IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13265{
13266 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13267 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13268 ASMCompilerBarrier();
13269 puDst->au32[0] = uSrc1.au32[0];
13270 puDst->au32[1] = uSrc2.au32[0];
13271 puDst->au32[2] = uSrc1.au32[1];
13272 puDst->au32[3] = uSrc2.au32[1];
13273}
13274
13275
13276IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13277{
13278 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13279 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13280 ASMCompilerBarrier();
13281 puDst->au32[0] = uSrc1.au32[0];
13282 puDst->au32[1] = uSrc2.au32[0];
13283 puDst->au32[2] = uSrc1.au32[1];
13284 puDst->au32[3] = uSrc2.au32[1];
13285
13286 puDst->au32[4] = uSrc1.au32[4];
13287 puDst->au32[5] = uSrc2.au32[4];
13288 puDst->au32[6] = uSrc1.au32[5];
13289 puDst->au32[7] = uSrc2.au32[5];
13290}
13291
13292
13293/*
13294 * UNPCKLPD / VUNPCKLPD
13295 */
13296#ifdef IEM_WITHOUT_ASSEMBLY
13297IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13298{
13299 RTUINT128U uSrc1 = *puDst;
13300 RTUINT128U uSrc2 = *puSrc;
13301 ASMCompilerBarrier();
13302 puDst->au64[0] = uSrc1.au64[0];
13303 puDst->au64[1] = uSrc2.au64[0];
13304}
13305
13306#endif
13307
13308IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13309{
13310 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13311 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13312 ASMCompilerBarrier();
13313 puDst->au64[0] = uSrc1.au64[0];
13314 puDst->au64[1] = uSrc2.au64[0];
13315}
13316
13317
13318IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13319{
13320 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13321 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13322 ASMCompilerBarrier();
13323 puDst->au64[0] = uSrc1.au64[0];
13324 puDst->au64[1] = uSrc2.au64[0];
13325 puDst->au64[2] = uSrc1.au64[2];
13326 puDst->au64[3] = uSrc2.au64[2];
13327}
13328
13329
13330/*
13331 * UNPCKHPS / VUNPCKHPS
13332 */
13333#ifdef IEM_WITHOUT_ASSEMBLY
13334IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13335{
13336 RTUINT128U uSrc1 = *puDst;
13337 RTUINT128U uSrc2 = *puSrc;
13338 ASMCompilerBarrier();
13339 puDst->au32[0] = uSrc1.au32[2];
13340 puDst->au32[1] = uSrc2.au32[2];
13341 puDst->au32[2] = uSrc1.au32[3];
13342 puDst->au32[3] = uSrc2.au32[3];
13343}
13344
13345#endif
13346
13347IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13348{
13349 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13350 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13351 ASMCompilerBarrier();
13352 puDst->au32[0] = uSrc1.au32[2];
13353 puDst->au32[1] = uSrc2.au32[2];
13354 puDst->au32[2] = uSrc1.au32[3];
13355 puDst->au32[3] = uSrc2.au32[3];
13356}
13357
13358
13359IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13360{
13361 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13362 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13363 ASMCompilerBarrier();
13364 puDst->au32[0] = uSrc1.au32[2];
13365 puDst->au32[1] = uSrc2.au32[2];
13366 puDst->au32[2] = uSrc1.au32[3];
13367 puDst->au32[3] = uSrc2.au32[3];
13368
13369 puDst->au32[4] = uSrc1.au32[6];
13370 puDst->au32[5] = uSrc2.au32[6];
13371 puDst->au32[6] = uSrc1.au32[7];
13372 puDst->au32[7] = uSrc2.au32[7];
13373}
13374
13375
13376/*
13377 * UNPCKHPD / VUNPCKHPD
13378 */
13379#ifdef IEM_WITHOUT_ASSEMBLY
13380IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13381{
13382 RTUINT128U uSrc1 = *puDst;
13383 RTUINT128U uSrc2 = *puSrc;
13384 ASMCompilerBarrier();
13385 puDst->au64[0] = uSrc1.au64[1];
13386 puDst->au64[1] = uSrc2.au64[1];
13387}
13388
13389#endif
13390
13391IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13392{
13393 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13394 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13395 ASMCompilerBarrier();
13396 puDst->au64[0] = uSrc1.au64[1];
13397 puDst->au64[1] = uSrc2.au64[1];
13398}
13399
13400
13401IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13402{
13403 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13404 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13405 ASMCompilerBarrier();
13406 puDst->au64[0] = uSrc1.au64[1];
13407 puDst->au64[1] = uSrc2.au64[1];
13408 puDst->au64[2] = uSrc1.au64[3];
13409 puDst->au64[3] = uSrc2.au64[3];
13410}
13411
13412
13413/*
13414 * CRC32 (SSE 4.2).
13415 */
13416
13417IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
13418{
13419 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13420}
13421
13422
13423IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
13424{
13425 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13426}
13427
13428IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
13429{
13430 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13431}
13432
13433IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
13434{
13435 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13436}
13437
13438
13439/*
13440 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
13441 */
13442#ifdef IEM_WITHOUT_ASSEMBLY
13443IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
13444{
13445 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
13446 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
13447 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
13448 fEfl |= X86_EFL_ZF;
13449 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
13450 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
13451 fEfl |= X86_EFL_CF;
13452 *pfEFlags = fEfl;
13453}
13454#endif
13455
13456IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
13457{
13458 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
13459 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
13460 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
13461 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
13462 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
13463 fEfl |= X86_EFL_ZF;
13464 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
13465 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
13466 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
13467 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
13468 fEfl |= X86_EFL_CF;
13469 *pfEFlags = fEfl;
13470}
13471
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette