VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 104099

Last change on this file since 104099 was 104076, checked in by vboxsync, 11 months ago

VMM/IEM: Implement 'microcoded' vpinsr[bwdq] instruction decode, dispatch & emulation, bugref:9898

  • eliminate '256 immediate instructions' jumptable implementations of pinsrw, vpinsrw
  • eliminate 'fallback' C implementations of pinsrw, vpinsrw
  • add 'IEM_MC_FETCH_MREG_U8' micro-op
  • add 'IEM_MC_STORE_MREG_U8, IEM_MC_STORE_MREG_U16, IEM_MC_STORE_MREG_U32' micro-ops
  • fix 'IEM_MC_STORE_XREG_U8' micro-op to store 8, not 32 bits (at the right offset)
  • fix 'IEM_MC_STORE_XREG_U16' micro-op to store 16, not 32 bits (at the right offset)
  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 740.3 KB
Line 
1/* $Id: IEMAllAImplC.cpp 104076 2024-03-27 08:19:25Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless already defined by the includer, it is defined automatically when
 * RT_ARCH_ARM32, RT_ARCH_ARM64 or DOXYGEN_RUNNING is defined.
 */
#if !defined(IEM_WITHOUT_ASSEMBLY)
# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
#  define IEM_WITHOUT_ASSEMBLY
# endif
#endif
/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#ifdef IEM_WITH_ASSEMBLY
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the signed flag value given a result and its bit width.
 *
 * The signed flag (SF) is a duplication of the most significant bit in the
 * result.  The result is shifted right so that its top bit lands on the SF
 * bit position and is then masked with X86_EFL_SF.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
73
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) indicates whether the result is zero or not: the 0/1
 * outcome of the comparison is shifted up to the ZF bit position.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * The input is a value whose most significant bit (for the given operand
 * width) holds the overflow indicator; the macro moves that bit to the OF bit
 * position and masks everything else away.
 *
 * These are typically used by concatenating the macro name with a bit count.
 * The problem is that 8-bit values need shifting in the other direction
 * (left) than the others (right).
 */
#define X86_EFL_GET_OF_8(a_uValue)      (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_16(a_uValue)     ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_32(a_uValue)     ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_64(a_uValue)     ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
 *
 * This is a statement macro (do/while(0)); it yields no value.  All bits in
 * X86_EFL_STATUS_BITS are cleared and recomputed, the other bits of the
 * EFLAGS value are left as they were.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.  Evaluated several times.
 * @param   a_uDst          The original destination value (for AF+OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be a
 *                          plain literal since it is token-pasted into
 *                          X86_EFL_GET_OF_<width> and uint<width>_t.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same signed \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the signed bit value must differ between \
           the two inputs and the result's signed bit diff from the first input. \
           Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand is
 * undefined.  We clear AF, as that seems to make the most sense and also seems
 * to be the correct behavior on current CPUs.
 *
 * This is a statement macro (do/while(0)); it yields no value.  All bits in
 * X86_EFL_STATUS_BITS are cleared first, then PF, ZF and SF are derived from
 * the result and @a a_fExtra is OR'ed in.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.  Evaluated several times.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT32U g_ar32One[];
464extern const RTFLOAT80U g_ar80One[];
465extern const RTFLOAT80U g_r80Indefinite;
466extern const RTFLOAT32U g_ar32Infinity[];
467extern const RTFLOAT64U g_ar64Infinity[];
468extern const RTFLOAT80U g_ar80Infinity[];
469extern const RTFLOAT128U g_r128Ln2;
470extern const RTUINT128U g_u128Ln2Mantissa;
471extern const RTUINT128U g_u128Ln2MantissaIntel;
472extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
473extern const RTFLOAT32U g_ar32QNaN[];
474extern const RTFLOAT64U g_ar64QNaN[];
475
476/** Zero values (indexed by fSign). */
477RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
478RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
479RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
480
481/** One values (indexed by fSign). */
482RTFLOAT32U const g_ar32One[] =
483{ RTFLOAT32U_INIT(0, 0, RTFLOAT32U_EXP_BIAS), RTFLOAT32U_INIT(1, 0, RTFLOAT32U_EXP_BIAS) };
484RTFLOAT80U const g_ar80One[] =
485{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
486
487/** Indefinite (negative). */
488RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
489
490/** Infinities (indexed by fSign). */
491RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
492RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
493RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
494
495/** Default QNaNs (indexed by fSign). */
496RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
497RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
498
499
500#if 0
501/** 128-bit floating point constant: 2.0 */
502const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
503#endif
504
505
506/* The next section is generated by tools/IEMGenFpuConstants: */
507
508/** The ln2 constant as 128-bit floating point value.
509 * base-10: 6.93147180559945309417232121458176575e-1
510 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
511 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
512//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
513const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
514/** High precision ln2 value.
515 * base-10: 6.931471805599453094172321214581765680747e-1
516 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
517 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
518const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
519/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
520 * base-10: 6.931471805599453094151379470289064954613e-1
521 * base-16: b.17217f7d1cf79abc0000000000000000@-1
522 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
523const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
524
525/** Horner constants for f2xm1 */
526const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
527{
528 /* a0
529 * base-10: 1.00000000000000000000000000000000000e0
530 * base-16: 1.0000000000000000000000000000@0
531 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
532 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
533 /* a1
534 * base-10: 5.00000000000000000000000000000000000e-1
535 * base-16: 8.0000000000000000000000000000@-1
536 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
537 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
538 /* a2
539 * base-10: 1.66666666666666666666666666666666658e-1
540 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
541 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
542 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
543 /* a3
544 * base-10: 4.16666666666666666666666666666666646e-2
545 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
546 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
547 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
548 /* a4
549 * base-10: 8.33333333333333333333333333333333323e-3
550 * base-16: 2.2222222222222222222222222222@-2
551 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
552 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
553 /* a5
554 * base-10: 1.38888888888888888888888888888888874e-3
555 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
556 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
557 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
558 /* a6
559 * base-10: 1.98412698412698412698412698412698412e-4
560 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
561 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
562 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
563 /* a7
564 * base-10: 2.48015873015873015873015873015873015e-5
565 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
566 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
567 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
568 /* a8
569 * base-10: 2.75573192239858906525573192239858902e-6
570 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
571 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
572 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
573 /* a9
574 * base-10: 2.75573192239858906525573192239858865e-7
575 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
576 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
577 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
578 /* a10
579 * base-10: 2.50521083854417187750521083854417184e-8
580 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
581 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
582 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
583 /* a11
584 * base-10: 2.08767569878680989792100903212014296e-9
585 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
586 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
587 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
588 /* a12
589 * base-10: 1.60590438368216145993923771701549472e-10
590 * base-16: b.092309d43684be51c198e91d7b40@-9
591 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
592 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
593 /* a13
594 * base-10: 1.14707455977297247138516979786821043e-11
595 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
596 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
597 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
598 /* a14
599 * base-10: 7.64716373181981647590113198578806964e-13
600 * base-16: d.73f9f399dc0f88ec32b587746578@-11
601 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
602 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
603 /* a15
604 * base-10: 4.77947733238738529743820749111754352e-14
605 * base-16: d.73f9f399dc0f88ec32b587746578@-12
606 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
607 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
608 /* a16
609 * base-10: 2.81145725434552076319894558301031970e-15
610 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
611 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
612 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
613 /* a17
614 * base-10: 1.56192069685862264622163643500573321e-16
615 * base-16: b.413c31dcbecbbdd8024435161550@-14
616 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
617 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
618 /* a18
619 * base-10: 8.22063524662432971695598123687227980e-18
620 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
621 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
622 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
623 /* a19
624 * base-10: 4.11031762331216485847799061843614006e-19
625 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
626 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
627 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
628 /* a20
629 * base-10: 1.95729410633912612308475743735054143e-20
630 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
631 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
632 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
633 /* a21
634 * base-10: 8.89679139245057328674889744250246106e-22
635 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
636 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
637 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
638};
639
640
641/*
642 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
643 * it all in C is probably safer atm., optimize what's necessary later, maybe.
644 */
645#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
646
647
648/*********************************************************************************************************************************
649* Binary Operations *
650*********************************************************************************************************************************/
651
652/*
653 * ADD
654 */
655
656IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
657{
658 uint64_t uDst = *puDst;
659 uint64_t uResult = uDst + uSrc;
660 *puDst = uResult;
661 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
662}
663
664# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
665
666IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
667{
668 uint32_t uDst = *puDst;
669 uint32_t uResult = uDst + uSrc;
670 *puDst = uResult;
671 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
672}
673
674
675IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
676{
677 uint16_t uDst = *puDst;
678 uint16_t uResult = uDst + uSrc;
679 *puDst = uResult;
680 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
681}
682
683
684IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
685{
686 uint8_t uDst = *puDst;
687 uint8_t uResult = uDst + uSrc;
688 *puDst = uResult;
689 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
690}
691
692# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
693
694/*
695 * ADC
696 */
697
698IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
699{
700 if (!(*pfEFlags & X86_EFL_CF))
701 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
702 else
703 {
704 uint64_t uDst = *puDst;
705 uint64_t uResult = uDst + uSrc + 1;
706 *puDst = uResult;
707 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
708 }
709}
710
711# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
712
713IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
714{
715 if (!(*pfEFlags & X86_EFL_CF))
716 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
717 else
718 {
719 uint32_t uDst = *puDst;
720 uint32_t uResult = uDst + uSrc + 1;
721 *puDst = uResult;
722 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
723 }
724}
725
726
727IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
728{
729 if (!(*pfEFlags & X86_EFL_CF))
730 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
731 else
732 {
733 uint16_t uDst = *puDst;
734 uint16_t uResult = uDst + uSrc + 1;
735 *puDst = uResult;
736 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
737 }
738}
739
740
741IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
742{
743 if (!(*pfEFlags & X86_EFL_CF))
744 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
745 else
746 {
747 uint8_t uDst = *puDst;
748 uint8_t uResult = uDst + uSrc + 1;
749 *puDst = uResult;
750 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
751 }
752}
753
754# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
755
756/*
757 * SUB
758 */
759# if !defined(RT_ARCH_ARM64)
760
761IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
762{
763 uint64_t uDst = *puDst;
764 uint64_t uResult = uDst - uSrc;
765 *puDst = uResult;
766 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
767}
768
769# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
770
771IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
772{
773 uint32_t uDst = *puDst;
774 uint32_t uResult = uDst - uSrc;
775 *puDst = uResult;
776 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
777}
778
779
780IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
781{
782 uint16_t uDst = *puDst;
783 uint16_t uResult = uDst - uSrc;
784 *puDst = uResult;
785 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
786}
787
788
789IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
790{
791 uint8_t uDst = *puDst;
792 uint8_t uResult = uDst - uSrc;
793 *puDst = uResult;
794 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
795}
796
797# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
798# endif /* !RT_ARCH_ARM64 */
799
800/*
801 * SBB
802 */
803
804IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
805{
806 if (!(*pfEFlags & X86_EFL_CF))
807 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
808 else
809 {
810 uint64_t uDst = *puDst;
811 uint64_t uResult = uDst - uSrc - 1;
812 *puDst = uResult;
813 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
814 }
815}
816
817# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
818
819IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
820{
821 if (!(*pfEFlags & X86_EFL_CF))
822 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
823 else
824 {
825 uint32_t uDst = *puDst;
826 uint32_t uResult = uDst - uSrc - 1;
827 *puDst = uResult;
828 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
829 }
830}
831
832
833IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
834{
835 if (!(*pfEFlags & X86_EFL_CF))
836 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
837 else
838 {
839 uint16_t uDst = *puDst;
840 uint16_t uResult = uDst - uSrc - 1;
841 *puDst = uResult;
842 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
843 }
844}
845
846
847IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
848{
849 if (!(*pfEFlags & X86_EFL_CF))
850 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
851 else
852 {
853 uint8_t uDst = *puDst;
854 uint8_t uResult = uDst - uSrc - 1;
855 *puDst = uResult;
856 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
857 }
858}
859
860# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
861
862
863/*
864 * OR
865 */
866
867IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
868{
869 uint64_t uResult = *puDst | uSrc;
870 *puDst = uResult;
871 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
872}
873
874# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
875
876IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
877{
878 uint32_t uResult = *puDst | uSrc;
879 *puDst = uResult;
880 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
881}
882
883
884IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
885{
886 uint16_t uResult = *puDst | uSrc;
887 *puDst = uResult;
888 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
889}
890
891
892IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
893{
894 uint8_t uResult = *puDst | uSrc;
895 *puDst = uResult;
896 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
897}
898
899# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
900
901/*
902 * XOR
903 */
904
905IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
906{
907 uint64_t uResult = *puDst ^ uSrc;
908 *puDst = uResult;
909 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
910}
911
912# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
913
914IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
915{
916 uint32_t uResult = *puDst ^ uSrc;
917 *puDst = uResult;
918 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
919}
920
921
922IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
923{
924 uint16_t uResult = *puDst ^ uSrc;
925 *puDst = uResult;
926 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
927}
928
929
930IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
931{
932 uint8_t uResult = *puDst ^ uSrc;
933 *puDst = uResult;
934 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
935}
936
937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
938
939/*
940 * AND
941 */
942
943IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
944{
945 uint64_t const uResult = *puDst & uSrc;
946 *puDst = uResult;
947 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
948}
949
950# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
951
952IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
953{
954 uint32_t const uResult = *puDst & uSrc;
955 *puDst = uResult;
956 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
957}
958
959
960IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
961{
962 uint16_t const uResult = *puDst & uSrc;
963 *puDst = uResult;
964 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
965}
966
967
968IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
969{
970 uint8_t const uResult = *puDst & uSrc;
971 *puDst = uResult;
972 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
973}
974
975# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
976#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
977
978/*
979 * ANDN (BMI1 instruction)
980 */
981
982IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
983{
984 uint64_t const uResult = ~uSrc1 & uSrc2;
985 *puDst = uResult;
986 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
987}
988
989
990IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
991{
992 uint32_t const uResult = ~uSrc1 & uSrc2;
993 *puDst = uResult;
994 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
995}
996
997
998#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
999IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1000{
1001 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1002}
1003#endif
1004
1005
1006#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1007IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1008{
1009 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1010}
1011#endif
1012
1013#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1014
1015/*
1016 * CMP
1017 */
1018
1019IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1020{
1021 uint64_t uDstTmp = *puDst;
1022 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1023}
1024
1025# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1026
1027IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1028{
1029 uint32_t uDstTmp = *puDst;
1030 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1031}
1032
1033
1034IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1035{
1036 uint16_t uDstTmp = *puDst;
1037 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1038}
1039
1040
1041IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1042{
1043 uint8_t uDstTmp = *puDst;
1044 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1045}
1046
1047# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1048
1049/*
1050 * TEST
1051 */
1052
1053IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1054{
1055 uint64_t uResult = *puDst & uSrc;
1056 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
1057}
1058
1059# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1060
1061IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1062{
1063 uint32_t uResult = *puDst & uSrc;
1064 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
1065}
1066
1067
1068IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1069{
1070 uint16_t uResult = *puDst & uSrc;
1071 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
1072}
1073
1074
1075IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1076{
1077 uint8_t uResult = *puDst & uSrc;
1078 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
1079}
1080
1081# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1082
1083
1084/*
1085 * LOCK prefixed variants of the above
1086 */
1087
/**
 * Locked binary operand operation, for any operand width.
 *
 * Must be expanded inside a function that has puDst, uSrc and pfEFlags in
 * scope.  The non-locked worker iemAImpl_<mnemonic>_u<width> is run on private
 * copies of the destination and of EFLAGS; the new value is then published
 * with ASMAtomicCmpXchgExU<width>, and the whole sequence is repeated until
 * that call reports success.  *pfEFlags is written only after the destination
 * has been updated.
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uDesired; \
        uint32_t                   fEflWork; \
        for (;;) \
        { \
            uDesired = uExpected; \
            fEflWork = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uDesired, uSrc, &fEflWork); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uDesired, uExpected, &uExpected)) \
                break; \
        } \
        *pfEFlags = fEflWork; \
    } while (0)
1102
1103
/**
 * Emits the function iemAImpl_<mnemonic>_u<width>_locked, taking
 * (puDst, uSrc, pfEFlags), whose whole body is DO_LOCKED_BIN_OP.
 */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked, \
                      (uint ## a_cBitsWidth ## _t *puDst, uint ## a_cBitsWidth ## _t uSrc, uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1111
1112EMIT_LOCKED_BIN_OP(add, 64)
1113EMIT_LOCKED_BIN_OP(adc, 64)
1114EMIT_LOCKED_BIN_OP(sub, 64)
1115EMIT_LOCKED_BIN_OP(sbb, 64)
1116EMIT_LOCKED_BIN_OP(or, 64)
1117EMIT_LOCKED_BIN_OP(xor, 64)
1118EMIT_LOCKED_BIN_OP(and, 64)
1119# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1120EMIT_LOCKED_BIN_OP(add, 32)
1121EMIT_LOCKED_BIN_OP(adc, 32)
1122EMIT_LOCKED_BIN_OP(sub, 32)
1123EMIT_LOCKED_BIN_OP(sbb, 32)
1124EMIT_LOCKED_BIN_OP(or, 32)
1125EMIT_LOCKED_BIN_OP(xor, 32)
1126EMIT_LOCKED_BIN_OP(and, 32)
1127
1128EMIT_LOCKED_BIN_OP(add, 16)
1129EMIT_LOCKED_BIN_OP(adc, 16)
1130EMIT_LOCKED_BIN_OP(sub, 16)
1131EMIT_LOCKED_BIN_OP(sbb, 16)
1132EMIT_LOCKED_BIN_OP(or, 16)
1133EMIT_LOCKED_BIN_OP(xor, 16)
1134EMIT_LOCKED_BIN_OP(and, 16)
1135
1136EMIT_LOCKED_BIN_OP(add, 8)
1137EMIT_LOCKED_BIN_OP(adc, 8)
1138EMIT_LOCKED_BIN_OP(sub, 8)
1139EMIT_LOCKED_BIN_OP(sbb, 8)
1140EMIT_LOCKED_BIN_OP(or, 8)
1141EMIT_LOCKED_BIN_OP(xor, 8)
1142EMIT_LOCKED_BIN_OP(and, 8)
1143# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1144
1145
1146/*
1147 * Bit operations (same signature as above).
1148 */
1149
1150/*
1151 * BT
1152 */
1153
1154IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1155{
1156 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1157 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1158 Assert(uSrc < 64);
1159 uint64_t uDst = *puDst;
1160 if (uDst & RT_BIT_64(uSrc))
1161 *pfEFlags |= X86_EFL_CF;
1162 else
1163 *pfEFlags &= ~X86_EFL_CF;
1164}
1165
1166# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1167
1168IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1169{
1170 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1171 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1172 Assert(uSrc < 32);
1173 uint32_t uDst = *puDst;
1174 if (uDst & RT_BIT_32(uSrc))
1175 *pfEFlags |= X86_EFL_CF;
1176 else
1177 *pfEFlags &= ~X86_EFL_CF;
1178}
1179
1180IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1181{
1182 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1183 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1184 Assert(uSrc < 16);
1185 uint16_t uDst = *puDst;
1186 if (uDst & RT_BIT_32(uSrc))
1187 *pfEFlags |= X86_EFL_CF;
1188 else
1189 *pfEFlags &= ~X86_EFL_CF;
1190}
1191
1192# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1193
1194/*
1195 * BTC
1196 */
1197
1198IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1199{
1200 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1201 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1202 Assert(uSrc < 64);
1203 uint64_t fMask = RT_BIT_64(uSrc);
1204 uint64_t uDst = *puDst;
1205 if (uDst & fMask)
1206 {
1207 uDst &= ~fMask;
1208 *puDst = uDst;
1209 *pfEFlags |= X86_EFL_CF;
1210 }
1211 else
1212 {
1213 uDst |= fMask;
1214 *puDst = uDst;
1215 *pfEFlags &= ~X86_EFL_CF;
1216 }
1217}
1218
1219# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1220
1221IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1222{
1223 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1224 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1225 Assert(uSrc < 32);
1226 uint32_t fMask = RT_BIT_32(uSrc);
1227 uint32_t uDst = *puDst;
1228 if (uDst & fMask)
1229 {
1230 uDst &= ~fMask;
1231 *puDst = uDst;
1232 *pfEFlags |= X86_EFL_CF;
1233 }
1234 else
1235 {
1236 uDst |= fMask;
1237 *puDst = uDst;
1238 *pfEFlags &= ~X86_EFL_CF;
1239 }
1240}
1241
1242
1243IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1244{
1245 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1246 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1247 Assert(uSrc < 16);
1248 uint16_t fMask = RT_BIT_32(uSrc);
1249 uint16_t uDst = *puDst;
1250 if (uDst & fMask)
1251 {
1252 uDst &= ~fMask;
1253 *puDst = uDst;
1254 *pfEFlags |= X86_EFL_CF;
1255 }
1256 else
1257 {
1258 uDst |= fMask;
1259 *puDst = uDst;
1260 *pfEFlags &= ~X86_EFL_CF;
1261 }
1262}
1263
1264# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1265
1266/*
1267 * BTR
1268 */
1269
1270IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1271{
1272 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1273 logical operation (AND/OR/whatever). */
1274 Assert(uSrc < 64);
1275 uint64_t fMask = RT_BIT_64(uSrc);
1276 uint64_t uDst = *puDst;
1277 if (uDst & fMask)
1278 {
1279 uDst &= ~fMask;
1280 *puDst = uDst;
1281 *pfEFlags |= X86_EFL_CF;
1282 }
1283 else
1284 *pfEFlags &= ~X86_EFL_CF;
1285}
1286
1287# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1288
1289IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1290{
1291 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1292 logical operation (AND/OR/whatever). */
1293 Assert(uSrc < 32);
1294 uint32_t fMask = RT_BIT_32(uSrc);
1295 uint32_t uDst = *puDst;
1296 if (uDst & fMask)
1297 {
1298 uDst &= ~fMask;
1299 *puDst = uDst;
1300 *pfEFlags |= X86_EFL_CF;
1301 }
1302 else
1303 *pfEFlags &= ~X86_EFL_CF;
1304}
1305
1306
1307IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1308{
1309 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1310 logical operation (AND/OR/whatever). */
1311 Assert(uSrc < 16);
1312 uint16_t fMask = RT_BIT_32(uSrc);
1313 uint16_t uDst = *puDst;
1314 if (uDst & fMask)
1315 {
1316 uDst &= ~fMask;
1317 *puDst = uDst;
1318 *pfEFlags |= X86_EFL_CF;
1319 }
1320 else
1321 *pfEFlags &= ~X86_EFL_CF;
1322}
1323
1324# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1325
1326/*
1327 * BTS
1328 */
1329
1330IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1331{
1332 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1333 logical operation (AND/OR/whatever). */
1334 Assert(uSrc < 64);
1335 uint64_t fMask = RT_BIT_64(uSrc);
1336 uint64_t uDst = *puDst;
1337 if (uDst & fMask)
1338 *pfEFlags |= X86_EFL_CF;
1339 else
1340 {
1341 uDst |= fMask;
1342 *puDst = uDst;
1343 *pfEFlags &= ~X86_EFL_CF;
1344 }
1345}
1346
1347# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1348
1349IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1350{
1351 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1352 logical operation (AND/OR/whatever). */
1353 Assert(uSrc < 32);
1354 uint32_t fMask = RT_BIT_32(uSrc);
1355 uint32_t uDst = *puDst;
1356 if (uDst & fMask)
1357 *pfEFlags |= X86_EFL_CF;
1358 else
1359 {
1360 uDst |= fMask;
1361 *puDst = uDst;
1362 *pfEFlags &= ~X86_EFL_CF;
1363 }
1364}
1365
1366
1367IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1368{
1369 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1370 logical operation (AND/OR/whatever). */
1371 Assert(uSrc < 16);
1372 uint16_t fMask = RT_BIT_32(uSrc);
1373 uint32_t uDst = *puDst;
1374 if (uDst & fMask)
1375 *pfEFlags |= X86_EFL_CF;
1376 else
1377 {
1378 uDst |= fMask;
1379 *puDst = uDst;
1380 *pfEFlags &= ~X86_EFL_CF;
1381 }
1382}
1383
1384# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1385
1386
1387EMIT_LOCKED_BIN_OP(btc, 64)
1388EMIT_LOCKED_BIN_OP(btr, 64)
1389EMIT_LOCKED_BIN_OP(bts, 64)
1390# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1391EMIT_LOCKED_BIN_OP(btc, 32)
1392EMIT_LOCKED_BIN_OP(btr, 32)
1393EMIT_LOCKED_BIN_OP(bts, 32)
1394
1395EMIT_LOCKED_BIN_OP(btc, 16)
1396EMIT_LOCKED_BIN_OP(btr, 16)
1397EMIT_LOCKED_BIN_OP(bts, 16)
1398# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1399
1400#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1401
1402/*
1403 * Helpers for BSR and BSF.
1404 *
1405 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1406 * Intel behavior modelled on 10980xe, AMD on 3990X. Other microarchitectures
1407 * may produce different results (see https://www.sandpile.org/x86/flags.htm),
1408 * but we restrict ourselves to emulating these recent microarchitectures.
1409 */
/**
 * Stores a BSF/BSR style search result using the Intel flag behaviour.
 *
 * OF, SF, ZF, AF, PF and CF are cleared first.  When a bit was found, the
 * zero-based index is stored and PF is taken from g_afParity for that index;
 * otherwise the destination is left untouched and ZF and PF are set.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the EFLAGS value (in/out).
 * @param   a_iBit      One-based index of the bit found, zero if none.
 *                      Evaluated exactly once.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR style search result using the AMD flag behaviour.
 *
 * Only ZF is modified: it is cleared when a bit was found (and the zero-based
 * index is stored), and set otherwise (destination left untouched).
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the EFLAGS value (in/out).
 * @param   a_iBit      One-based index of the bit found, zero if none.
 *                      Evaluated exactly once.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1432
1433/*
1434 * BSF - first (least significant) bit set
1435 */
1436#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1437IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1438{
1439 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1440}
1441#endif
1442
1443IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1444{
1445 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1446}
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1451}
1452
1453#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1454IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1455{
1456 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1457}
1458#endif
1459
1460IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1461{
1462 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1463}
1464
1465IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1466{
1467 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1468}
1469
1470
1471#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1472IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1473{
1474 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1475}
1476#endif
1477
1478IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1479{
1480 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1481}
1482
1483IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1484{
1485 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1486}
1487
1488
1489
1490/*
1491 * BSR - last (most significant) bit set
1492 */
1493#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1494IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1495{
1496 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1497}
1498#endif
1499
1500IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1501{
1502 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1503}
1504
1505IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1506{
1507 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1508}
1509
1510
1511#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516#endif
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1521}
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1526}
1527
1528
1529#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1530IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1531{
1532 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1533}
1534#endif
1535
1536IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1537{
1538 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1539}
1540
1541IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1542{
1543 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1544}
1545
1546
1547/*
1548 * Helpers for LZCNT and TZCNT.
1549 */
/**
 * Stores an LZCNT/TZCNT style count using the Intel flag behaviour.
 *
 * The count is always stored.  OF, SF, ZF, AF, PF and CF are cleared first;
 * a non-zero count takes PF from g_afParity, a zero count sets ZF and PF, and
 * CF is set when the source operand is zero.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand; only tested for zero.
 * @param   a_pfEFlags  Pointer to the EFLAGS value (in/out).
 * @param   a_uResult   The bit count.  Evaluated exactly once.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) /* parenthesised so expression arguments are negated as a whole */ \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT style count using the AMD flag behaviour.
 *
 * The count is always stored.  Only ZF and CF are modified: ZF is set when
 * the count is zero and CF is set when the source operand is zero.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand; only tested for zero.
 * @param   a_pfEFlags  Pointer to the EFLAGS value (in/out).
 * @param   a_uResult   The bit count.  Evaluated exactly once.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) /* parenthesised so expression arguments are negated as a whole */ \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1572
1573
1574/*
1575 * LZCNT - count leading zero bits.
1576 */
1577#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1578IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1579{
1580 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1581}
1582#endif
1583
1584IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1585{
1586 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1587}
1588
1589IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1590{
1591 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1592}
1593
1594
1595#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1596IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1597{
1598 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1599}
1600#endif
1601
1602IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1603{
1604 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1605}
1606
1607IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1608{
1609 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1610}
1611
1612
1613#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1614IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1615{
1616 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1617}
1618#endif
1619
1620IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1621{
1622 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1623}
1624
1625IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1626{
1627 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1628}
1629
1630
1631/*
1632 * TZCNT - count trailing zero bits.
1633 */
1634#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1635IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1636{
1637 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1638}
1639#endif
1640
1641IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1642{
1643 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1644}
1645
1646IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1647{
1648 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1649}
1650
1651
1652#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1654{
1655 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1656}
1657#endif
1658
1659IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1660{
1661 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1662}
1663
1664IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1665{
1666 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1667}
1668
1669
1670#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1671IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1672{
1673 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1674}
1675#endif
1676
1677IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1678{
1679 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1680}
1681
1682IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1683{
1684 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1685}
1686
1687
1688
1689/*
1690 * BEXTR (BMI1 instruction)
1691 */
/**
 * Emits iemAImpl_bextr_u<a_cBits><a_Suffix>.
 *
 * The low byte of uSrc2 is the first bit to extract and the next byte is the
 * number of bits.  A start position at or beyond the operand width yields
 * zero; a length at or beyond the operand width applies no mask.  OF, SF, ZF,
 * AF, PF and CF are cleared, and ZF is set again when the result is zero.
 */
#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
                                                                       a_Type uSrc2, uint32_t *pfEFlags)) \
{ \
    /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
    uint32_t      fEflOut    = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
    a_Type        uExtracted = 0; \
    uint8_t const iStart     = (uint8_t)uSrc2; \
    if (iStart < a_cBits) \
    { \
        uint8_t const cLength = (uint8_t)(uSrc2 >> 8); \
        uExtracted = uSrc1 >> iStart; \
        if (cLength < a_cBits) \
            uExtracted &= RT_CONCAT(RT_BIT_,a_cBits)(cLength) - 1; \
    } \
    *puDst = uExtracted; \
    if (!uExtracted) \
        fEflOut |= X86_EFL_ZF; \
    /** @todo complete flag calculations. */ \
    *pfEFlags = fEflOut; \
}
1718
1719EMIT_BEXTR(64, uint64_t, _fallback)
1720EMIT_BEXTR(32, uint32_t, _fallback)
1721#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1722EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1723#endif
1724#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1725EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1726#endif
1727
1728/*
1729 * BLSR (BMI1 instruction)
1730 */
/**
 * Emits iemAImpl_blsr_u<a_cBits><a_Suffix>: *puDst = uSrc & (uSrc - 1).
 *
 * The flags come from the AND worker, except that PF is cleared and CF is
 * taken from the SUB worker that computed uSrc - 1.
 */
#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
{ \
    uint32_t fEflSub = *pfEFlags; \
    uint32_t fEflAnd = fEflSub; \
    *puDst = uSrc; \
    iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEflSub); \
    iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEflAnd); \
    \
    /* AMD: The carry flag is from the SUB operation. */ \
    /* 10980xe: PF always cleared? */ \
    *pfEFlags = (fEflAnd & ~(X86_EFL_CF | X86_EFL_PF)) | (fEflSub & X86_EFL_CF); \
}
1746
1747EMIT_BLSR(64, uint64_t, _fallback)
1748EMIT_BLSR(32, uint32_t, _fallback)
1749#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1750EMIT_BLSR(64, uint64_t, RT_NOTHING)
1751#endif
1752#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1753EMIT_BLSR(32, uint32_t, RT_NOTHING)
1754#endif
1755
1756/*
1757 * BLSMSK (BMI1 instruction)
1758 */
/**
 * Emits iemAImpl_blsmsk_u<a_cBits><a_Suffix>: *puDst = uSrc ^ (uSrc - 1).
 *
 * The flags come from the XOR worker, except that PF is cleared and CF is
 * taken from the SUB worker that computed uSrc - 1.
 */
#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
{ \
    uint32_t fEflSub = *pfEFlags; \
    uint32_t fEflXor = fEflSub; \
    *puDst = uSrc; \
    iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEflSub); \
    iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEflXor); \
    \
    /* AMD: The carry flag is from the SUB operation. */ \
    /* 10980xe: PF always cleared? */ \
    *pfEFlags = (fEflXor & ~(X86_EFL_CF | X86_EFL_PF)) | (fEflSub & X86_EFL_CF); \
}
1774
1775EMIT_BLSMSK(64, uint64_t, _fallback)
1776EMIT_BLSMSK(32, uint32_t, _fallback)
1777#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1778EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1779#endif
1780#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1781EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1782#endif
1783
1784/*
1785 * BLSI (BMI1 instruction)
1786 */
1787#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1788IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1789{ \
1790 uint32_t fEfl1 = *pfEFlags; \
1791 uint32_t fEfl2 = fEfl1; \
1792 *puDst = uSrc; \
1793 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1794 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1795 \
1796 /* AMD: The carry flag is from the SUB operation. */ \
1797 /* 10890xe: PF always cleared? */ \
1798 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1799 fEfl2 |= fEfl1 & X86_EFL_CF; \
1800 *pfEFlags = fEfl2; \
1801}
1802
1803EMIT_BLSI(64, uint64_t, _fallback)
1804EMIT_BLSI(32, uint32_t, _fallback)
1805#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1806EMIT_BLSI(64, uint64_t, RT_NOTHING)
1807#endif
1808#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1809EMIT_BLSI(32, uint32_t, RT_NOTHING)
1810#endif
1811
1812/*
1813 * BZHI (BMI2 instruction)
1814 */
1815#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1816IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1817 a_Type uSrc2, uint32_t *pfEFlags)) \
1818{ \
1819 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1820 a_Type uResult; \
1821 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1822 if (iFirstBit < a_cBits) \
1823 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1824 else \
1825 { \
1826 uResult = uSrc1; \
1827 fEfl |= X86_EFL_CF; \
1828 } \
1829 *puDst = uResult; \
1830 fEfl |= X86_EFL_CALC_ZF(uResult); \
1831 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1832 *pfEFlags = fEfl; \
1833}
1834
1835EMIT_BZHI(64, uint64_t, _fallback)
1836EMIT_BZHI(32, uint32_t, _fallback)
1837#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1838EMIT_BZHI(64, uint64_t, RT_NOTHING)
1839#endif
1840#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1841EMIT_BZHI(32, uint32_t, RT_NOTHING)
1842#endif
1843
1844/*
1845 * POPCNT
1846 */
1847RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1848{
1849 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1850 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1851 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1852 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1853};
1854
1855/** @todo Use native popcount where possible and employ some more efficient
1856 * algorithm here (or in asm.h fallback)! */
1857
1858DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1859{
1860 return g_abBitCounts6[ u16 & 0x3f]
1861 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1862 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1863}
1864
1865DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1866{
1867 return g_abBitCounts6[ u32 & 0x3f]
1868 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1869 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1870 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1871 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1872 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1873}
1874
1875DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1876{
1877 return g_abBitCounts6[ u64 & 0x3f]
1878 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1879 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1880 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1881 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1882 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1883 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1884 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1885 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1886 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1887 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1888}
1889
1890#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1891IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1892{ \
1893 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1894 a_Type uResult; \
1895 if (uSrc) \
1896 uResult = iemPopCountU ## a_cBits(uSrc); \
1897 else \
1898 { \
1899 fEfl |= X86_EFL_ZF; \
1900 uResult = 0; \
1901 } \
1902 *puDst = uResult; \
1903 *pfEFlags = fEfl; \
1904}
1905
1906EMIT_POPCNT(64, uint64_t, _fallback)
1907EMIT_POPCNT(32, uint32_t, _fallback)
1908EMIT_POPCNT(16, uint16_t, _fallback)
1909#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1910EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1911#endif
1912#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1913EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1914EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1915#endif
1916
1917
1918#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1919
1920/*
1921 * XCHG
1922 */
1923
1924IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1925{
1926#if ARCH_BITS >= 64
1927 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1928#else
1929 uint64_t uOldMem = *puMem;
1930 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1931 ASMNopPause();
1932 *puReg = uOldMem;
1933#endif
1934}
1935
1936# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1937
1938IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1939{
1940 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1941}
1942
1943
1944IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1945{
1946 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1947}
1948
1949
1950IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1951{
1952 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1953}
1954
1955# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1956
1957
1958/* Unlocked variants for fDisregardLock mode: */
1959
1960IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1961{
1962 uint64_t const uOld = *puMem;
1963 *puMem = *puReg;
1964 *puReg = uOld;
1965}
1966
1967# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1968
1969IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1970{
1971 uint32_t const uOld = *puMem;
1972 *puMem = *puReg;
1973 *puReg = uOld;
1974}
1975
1976
1977IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1978{
1979 uint16_t const uOld = *puMem;
1980 *puMem = *puReg;
1981 *puReg = uOld;
1982}
1983
1984
1985IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1986{
1987 uint8_t const uOld = *puMem;
1988 *puMem = *puReg;
1989 *puReg = uOld;
1990}
1991
1992# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1993
1994
1995/*
1996 * XADD and LOCK XADD.
1997 */
1998#define EMIT_XADD(a_cBitsWidth, a_Type) \
1999IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
2000{ \
2001 a_Type uDst = *puDst; \
2002 a_Type uResult = uDst; \
2003 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
2004 *puDst = uResult; \
2005 *puReg = uDst; \
2006} \
2007\
2008IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
2009{ \
2010 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2011 a_Type uResult; \
2012 uint32_t fEflTmp; \
2013 do \
2014 { \
2015 uResult = uOld; \
2016 fEflTmp = *pfEFlags; \
2017 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2018 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2019 *puReg = uOld; \
2020 *pfEFlags = fEflTmp; \
2021}
2022EMIT_XADD(64, uint64_t)
2023# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2024EMIT_XADD(32, uint32_t)
2025EMIT_XADD(16, uint16_t)
2026EMIT_XADD(8, uint8_t)
2027# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2028
2029#endif
2030
2031/*
2032 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2033 *
2034 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2035 * instructions are emulated as locked.
2036 */
2037#if defined(IEM_WITHOUT_ASSEMBLY)
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2040{
2041 uint8_t uOld = *puAl;
2042 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2043 Assert(*puAl == uOld);
2044 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2045}
2046
2047
2048IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2049{
2050 uint16_t uOld = *puAx;
2051 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2052 Assert(*puAx == uOld);
2053 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2054}
2055
2056
2057IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2058{
2059 uint32_t uOld = *puEax;
2060 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2061 Assert(*puEax == uOld);
2062 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2063}
2064
2065
2066# if ARCH_BITS == 32
2067IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2068# else
2069IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2070# endif
2071{
2072# if ARCH_BITS == 32
2073 uint64_t const uSrcReg = *puSrcReg;
2074# endif
2075 uint64_t uOld = *puRax;
2076 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2077 Assert(*puRax == uOld);
2078 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2079}
2080
2081
2082IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2083 uint32_t *pEFlags))
2084{
2085 uint64_t const uNew = pu64EbxEcx->u;
2086 uint64_t const uOld = pu64EaxEdx->u;
2087 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2088 {
2089 Assert(pu64EaxEdx->u == uOld);
2090 *pEFlags |= X86_EFL_ZF;
2091 }
2092 else
2093 *pEFlags &= ~X86_EFL_ZF;
2094}
2095
2096
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * Locked CMPXCHG16B.
 *
 * Uses ASMAtomicCmpXchgU128v2 when RT_ARCH_AMD64 is defined and
 * ASMAtomicCmpXchgU128 otherwise.  ZF is set when the primitive reports
 * success and cleared otherwise; no other flag is touched.
 *
 * @param   pu128Dst        The destination (memory) operand.
 * @param   pu128RaxRdx     The expected value; also handed to the exchange
 *                          primitive as its output operand.
 * @param   pu128RbxRcx     The replacement value.
 * @param   pEFlags         The EFLAGS to update.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    RTUINT128U const uExpected = *pu128RaxRdx; /* Only consumed by the assertion below. */
#  endif
#  if defined(RT_ARCH_AMD64)
    bool const fExchanged = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo,
                                                   pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo, &pu128RaxRdx->u);
#  else
    bool const fExchanged = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fExchanged)
        *pEFlags &= ~X86_EFL_ZF;
    else
    {
        Assert(pu128RaxRdx->s.Lo == uExpected.s.Lo && pu128RaxRdx->s.Hi == uExpected.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
}
# endif
2118
2119#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2120
2121# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2122IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2123 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2124{
2125 RTUINT128U u128Tmp = *pu128Dst;
2126 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2127 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2128 {
2129 *pu128Dst = *pu128RbxRcx;
2130 *pEFlags |= X86_EFL_ZF;
2131 }
2132 else
2133 {
2134 *pu128RaxRdx = u128Tmp;
2135 *pEFlags &= ~X86_EFL_ZF;
2136 }
2137}
2138#endif /* !RT_ARCH_ARM64 */
2139
2140#if defined(IEM_WITHOUT_ASSEMBLY)
2141
/* Unlocked versions.  Only the 8-bit one is mapped to its locked counterpart; the others use plain loads and stores: */
2143
2144IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2145{
2146 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2147}
2148
2149
2150IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2151{
2152# if 0
2153 /* If correctly aligned, used the locked variation. */
2154 if (!((uintptr_t)pu16Dst & 1))
2155 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2156 else
2157# endif
2158 {
2159 /* Otherwise emulate it as best as we can. */
2160 uint16_t const uOld = *puAx;
2161 uint16_t const uDst = *pu16Dst;
2162 if (uOld == uDst)
2163 {
2164 *pu16Dst = uSrcReg;
2165 iemAImpl_cmp_u16(&uOld, uOld, pEFlags);
2166 }
2167 else
2168 {
2169 *puAx = uDst;
2170 iemAImpl_cmp_u16(&uOld, uDst, pEFlags);
2171 }
2172 }
2173}
2174
2175
2176IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2177{
2178# if 0
2179 /* If correctly aligned, used the locked variation. */
2180 if (!((uintptr_t)pu32Dst & 3))
2181 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2182 else
2183# endif
2184 {
2185 /* Otherwise emulate it as best as we can. */
2186 uint32_t const uOld = *puEax;
2187 uint32_t const uDst = *pu32Dst;
2188 if (uOld == uDst)
2189 {
2190 *pu32Dst = uSrcReg;
2191 iemAImpl_cmp_u32(&uOld, uOld, pEFlags);
2192 }
2193 else
2194 {
2195 *puEax = uDst;
2196 iemAImpl_cmp_u32(&uOld, uDst, pEFlags);
2197 }
2198 }
2199}
2200
2201
2202# if ARCH_BITS == 32
2203IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2204{
2205# if 0
2206 /* If correctly aligned, used the locked variation. */
2207 if (!((uintptr_t)pu32Dst & 7))
2208 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2209 else
2210# endif
2211 {
2212 /* Otherwise emulate it as best as we can. */
2213 uint64_t const uOld = *puRax;
2214 uint64_t const uSrc = *puSrcReg;
2215 uint64_t const uDst = *pu64Dst;
2216 if (uOld == uDst)
2217 {
2218 *pu64Dst = uSrc;
2219 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2220 }
2221 else
2222 {
2223 *puRax = uDst;
2224 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2225 }
2226 }
2227}
2228# else /* ARCH_BITS != 32 */
2229IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2230{
2231# if 0
2232 /* If correctly aligned, used the locked variation. */
2233 if (!((uintptr_t)pu64Dst & 7))
2234 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2235 else
2236# endif
2237 {
2238 /* Otherwise emulate it as best as we can. */
2239 uint64_t const uOld = *puRax;
2240 uint64_t const uDst = *pu64Dst;
2241 if (uOld == uDst)
2242 {
2243 *pu64Dst = uSrcReg;
2244 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2245 }
2246 else
2247 {
2248 *puRax = uDst;
2249 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2250 }
2251 }
2252}
2253# endif /* ARCH_BITS != 32 */
2254
2255
2256IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2257{
2258# if 0
2259 /* If correctly aligned, used the locked variation. */
2260 if (!((uintptr_t)pu64Dst & 7))
2261 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2262 else
2263# endif
2264 {
2265 /* Otherwise emulate it as best as we can. */
2266 uint64_t const uNew = pu64EbxEcx->u;
2267 uint64_t const uOld = pu64EaxEdx->u;
2268 uint64_t const uDst = *pu64Dst;
2269 if (uDst == uOld)
2270 {
2271 *pu64Dst = uNew;
2272 *pEFlags |= X86_EFL_ZF;
2273 }
2274 else
2275 {
2276 pu64EaxEdx->u = uDst;
2277 *pEFlags &= ~X86_EFL_ZF;
2278 }
2279 }
2280}
2281
2282
2283IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2284 uint32_t *pEFlags))
2285{
2286# if 0
2287 /* If correctly aligned, used the locked variation. */
2288 if (!((uintptr_t)pu64Dst & 15))
2289 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2290 else
2291# endif
2292 {
2293 /* Otherwise emulate it as best as we can. */
2294# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2295 uint128_t const uNew = pu128RbxRcx->u;
2296 uint128_t const uOld = pu128RaxRdx->u;
2297 uint128_t const uDst = pu128Dst->u;
2298 if (uDst == uOld)
2299 {
2300 pu128Dst->u = uNew;
2301 *pEFlags |= X86_EFL_ZF;
2302 }
2303 else
2304 {
2305 pu128RaxRdx->u = uDst;
2306 *pEFlags &= ~X86_EFL_ZF;
2307 }
2308# else
2309 RTUINT128U const uNew = *pu128RbxRcx;
2310 RTUINT128U const uOld = *pu128RaxRdx;
2311 RTUINT128U const uDst = *pu128Dst;
2312 if ( uDst.s.Lo == uOld.s.Lo
2313 && uDst.s.Hi == uOld.s.Hi)
2314 {
2315 *pu128Dst = uNew;
2316 *pEFlags |= X86_EFL_ZF;
2317 }
2318 else
2319 {
2320 *pu128RaxRdx = uDst;
2321 *pEFlags &= ~X86_EFL_ZF;
2322 }
2323# endif
2324 }
2325}
2326
2327#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2328
2329#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2330 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2331
/*
 * MUL, IMUL, DIV and IDIV helpers.
 *
 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
 *   multiplication and division steps in order to select between using C
 *   operators and RTUInt128DivRem/RTUInt128MulU64ByU64.
 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV taking all of their input in AX too.  This means we have to
 *   abstract some of the input loading and the result storing.
 */
2343
2344DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2345{
2346# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2347 pQuotient->s.Lo = 0;
2348 pQuotient->s.Hi = 0;
2349# endif
2350 RTUINT128U Divisor;
2351 Divisor.s.Lo = u64Divisor;
2352 Divisor.s.Hi = 0;
2353 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2354}
2355
/*
 * Operand access and arithmetic primitives plugged into the EMIT_MUL, EMIT_IMUL,
 * EMIT_DIV and EMIT_IDIV templates.  They reference the emitted functions'
 * parameters (puA, puD, puAX) by name.  Each macro expands to a single
 * expression without a trailing semicolon, so it is safe in unbraced if/else
 * bodies; all arguments are parenthesized.
 */

/** Loads the double width dividend: low half from *puA, high half from *puD. */
# define DIV_LOAD(a_Dividend) \
    ((a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD)
/** Loads the 16-bit dividend of the 8-bit division from *puAX. */
# define DIV_LOAD_U8(a_Dividend) \
    ((a_Dividend).u = *puAX)

/** Stores the quotient in *puA and the remainder in *puD. */
# define DIV_STORE(a_Quotient, a_uRemainder)    (*puA = (a_Quotient), *puD = (a_uRemainder))
/** Stores the quotient in the low byte of *puAX and the remainder in the high byte. */
# define DIV_STORE_U8(a_Quotient, a_uRemainder) (*puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8))

/** Fetches the implicit first factor from *puA. */
# define MUL_LOAD_F1()                          (*puA)
/** Fetches the implicit first factor of the 8-bit multiplication: the low byte of *puAX. */
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/** Stores the low half of the product in *puA and the high half in *puD. */
# define MUL_STORE(a_Result)                    (*puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi)
/** Stores the whole 16-bit product of the 8-bit multiplication in *puAX. */
# define MUL_STORE_U8(a_Result)                 (*puAX = (a_Result).u)

/** Negates a double width value in place using C operators. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    ((a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u)
/** Negates a 128-bit value in place using RTUInt128AssignNeg. */
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/** Multiplies two factors into a double width result using C operators. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    ((a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2))
/** Multiplies two 64-bit factors into a 128-bit result using RTUInt128MulU64ByU64. */
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), (a_Factor1), (a_Factor2))

/** Calculates quotient and remainder of a double width dividend using C operators. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (  (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
       (a_Remainder).u = (a_Dividend).u % (a_uDivisor))
/** Calculates quotient and remainder of a 128-bit dividend using RTUInt128DivRemByU64. */
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), (a_uDivisor))
2385
2386
2387/*
2388 * MUL
2389 */
2390# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2391IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2392{ \
2393 RTUINT ## a_cBitsWidth2x ## U Result; \
2394 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2395 a_fnStore(Result); \
2396 \
2397 /* Calc EFLAGS: */ \
2398 uint32_t fEfl = *pfEFlags; \
2399 if (a_fIntelFlags) \
2400 { /* Intel: 6700K and 10980XE behavior */ \
2401 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2402 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2403 fEfl |= X86_EFL_SF; \
2404 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2405 if (Result.s.Hi != 0) \
2406 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2407 } \
2408 else \
2409 { /* AMD: 3990X */ \
2410 if (Result.s.Hi != 0) \
2411 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2412 else \
2413 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2414 } \
2415 *pfEFlags = fEfl; \
2416 return 0; \
2417} \
2418
2419# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2420 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2421 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2422 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2423
2424# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2425EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2426 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2427# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2428EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2429 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2430EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2431 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2432EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2433 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2434# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2435# endif /* !DOXYGEN_RUNNING */
2436
2437/*
2438 * MULX
2439 */
2440# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2441IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2442 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2443{ \
2444 RTUINT ## a_cBitsWidth2x ## U Result; \
2445 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2446 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2447 *puDst1 = Result.s.Hi; \
2448} \
2449
2450# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2451EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2452EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2453# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2454EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2455EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2456# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2457# endif /* !DOXYGEN_RUNNING */
2458
2459
2460/*
2461 * IMUL
2462 *
2463 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2464 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2465 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2466 */
2467# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2468 a_Suffix, a_fIntelFlags) \
2469IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2470{ \
2471 RTUINT ## a_cBitsWidth2x ## U Result; \
2472 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2473 \
2474 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2475 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2476 { \
2477 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2478 { \
2479 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2480 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2481 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2482 } \
2483 else \
2484 { \
2485 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2486 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2487 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2488 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2489 a_fnNeg(Result, a_cBitsWidth2x); \
2490 } \
2491 } \
2492 else \
2493 { \
2494 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2495 { \
2496 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2497 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2498 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2499 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2500 a_fnNeg(Result, a_cBitsWidth2x); \
2501 } \
2502 else \
2503 { \
2504 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2505 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2506 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2507 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2508 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2509 } \
2510 } \
2511 a_fnStore(Result); \
2512 \
2513 if (a_fIntelFlags) \
2514 { \
2515 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2516 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2517 fEfl |= X86_EFL_SF; \
2518 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2519 } \
2520 *pfEFlags = fEfl; \
2521 return 0; \
2522}
2523# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2524 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2525 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2526 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2527
2528# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2529EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2530 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2531# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2532EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2533 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2534EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2535 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2536EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2537 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2538# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2539# endif /* !DOXYGEN_RUNNING */
2540
2541
2542/*
2543 * IMUL with two operands are mapped onto the three operand variant, ignoring
2544 * the high part of the product.
2545 */
2546# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2547IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2548{ \
2549 a_uType uIgn; \
2550 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2551} \
2552\
2553IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2554{ \
2555 a_uType uIgn; \
2556 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2557} \
2558\
2559IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2560{ \
2561 a_uType uIgn; \
2562 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2563}
2564
2565EMIT_IMUL_TWO(64, uint64_t)
2566# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2567EMIT_IMUL_TWO(32, uint32_t)
2568EMIT_IMUL_TWO(16, uint16_t)
2569# endif
2570
2571
2572/*
2573 * DIV
2574 */
2575# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2576 a_Suffix, a_fIntelFlags) \
2577IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2578{ \
2579 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2580 a_fnLoad(Dividend); \
2581 if ( uDivisor != 0 \
2582 && Dividend.s.Hi < uDivisor) \
2583 { \
2584 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2585 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2586 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2587 \
2588 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2589 if (!a_fIntelFlags) \
2590 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2591 return 0; \
2592 } \
2593 /* #DE */ \
2594 return -1; \
2595}
2596# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2597 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2598 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2599 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2600
2601# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2602EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2603 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2604# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2605EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2606 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2607EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2608 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2609EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2610 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2611# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2612# endif /* !DOXYGEN_RUNNING */
2613
2614
/*
 * IDIV
 *
 * The signed division is performed as an unsigned division on the magnitudes
 * of dividend and divisor; the signs of the quotient and the remainder are
 * patched up when storing the result.
 *
 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE.  AMD 3990X will
 * set AF and clear PF, ZF and SF just like it does for DIV.
 *
 * The emitted function returns 0 on success and -1 when \#DE should be raised
 * (zero divisor or a quotient that does not fit the destination).
 */
# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    /* Note! Skylake leaves all flags alone. */ \
    \
    /** @todo overflow checks */ \
    if (uDivisor == 0) \
        return -1; /* #DE */ \
    \
    /* \
     * Convert to unsigned division: fetch the double-width dividend and \
     * negate it if its top bit is set, then do the same for the divisor. \
     */ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
    if (fSignedDividend) \
        a_fnNeg(Dividend, a_cBitsWidth2x); \
    \
    bool const fSignedDivisor = RT_BOOL(uDivisor & RT_BIT_64(a_cBitsWidth - 1)); \
    uint ## a_cBitsWidth ## _t uDivisorPositive; \
    if (fSignedDivisor) \
        uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
    else \
        uDivisorPositive = uDivisor; \
    \
    RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
    \
    /* \
     * Store the result with the right signs, provided the quotient fits. \
     * A positive quotient may be at most INTx_MAX, a negative one may have \
     * a magnitude of up to 2^(bits - 1). \
     */ \
    bool fStored = false; \
    if (!fSignedDivisor) \
    { \
        if (!fSignedDividend) \
        { \
            /* Positive divisor, positive dividend => result positive. */ \
            if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
            { \
                a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
                fStored = true; \
            } \
        } \
        else \
        { \
            /* Positive divisor, negative dividend => result negative. */ \
            if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
            { \
                a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
                fStored = true; \
            } \
        } \
    } \
    else \
    { \
        if (!fSignedDividend) \
        { \
            /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
            if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
            { \
                a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
                fStored = true; \
            } \
        } \
        else \
        { \
            /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
            if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
            { \
                a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
                fStored = true; \
            } \
        } \
    } \
    \
    /* Quotient out of range: nothing was stored and the flags are untouched. */ \
    if (!fStored) \
        return -1; /* #DE */ \
    \
    /* Only the AMD flavour modifies EFLAGS: PF, ZF and SF cleared, AF set. */ \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}
2706# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2707 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2708 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2709 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2710
2711# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2712EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2713 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2714# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2715EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2716 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2717EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2718 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2719EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2720 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2721# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2722# endif /* !DOXYGEN_RUNNING */
2723
2724#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2725
2726
2727/*********************************************************************************************************************************
2728* Unary operations. *
2729*********************************************************************************************************************************/
2730#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2731
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Recalculates PF, AF, ZF, SF and OF for an INC or DEC instruction and writes
 * the updated flags back through @a a_pfEFlags.  This is a statement macro, it
 * does not yield a value.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value (evaluated several times).
 * @param   a_uDst          The original destination value (for AF and OF
 *                          calculation; evaluated several times).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* Start with the incoming flags, all status bits but CF cleared. */ \
        uint32_t fEflNew = *(a_pfEFlags) & ~(X86_EFL_STATUS_BITS & ~X86_EFL_CF); \
        fEflNew |= g_afParity[(a_uResult) & 0xff]; \
        /* AF: bit 4 flips between input and result exactly when bit 3 carried/borrowed. */ \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        if ((a_OfMethod) == 0) /* INC: sign bit clear before and set after. */ \
            fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth( (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) ); \
        else                   /* DEC: sign bit set before and clear after. */ \
            fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth( ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
2757
2758/*
2759 * INC
2760 */
2761
2762IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2763{
2764 uint64_t uDst = *puDst;
2765 uint64_t uResult = uDst + 1;
2766 *puDst = uResult;
2767 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2768}
2769
2770# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2771
2772IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2773{
2774 uint32_t uDst = *puDst;
2775 uint32_t uResult = uDst + 1;
2776 *puDst = uResult;
2777 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2778}
2779
2780
2781IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2782{
2783 uint16_t uDst = *puDst;
2784 uint16_t uResult = uDst + 1;
2785 *puDst = uResult;
2786 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2787}
2788
2789IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2790{
2791 uint8_t uDst = *puDst;
2792 uint8_t uResult = uDst + 1;
2793 *puDst = uResult;
2794 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2795}
2796
2797# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2798
2799
2800/*
2801 * DEC
2802 */
2803
2804IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2805{
2806 uint64_t uDst = *puDst;
2807 uint64_t uResult = uDst - 1;
2808 *puDst = uResult;
2809 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2810}
2811
2812# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2813
2814IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2815{
2816 uint32_t uDst = *puDst;
2817 uint32_t uResult = uDst - 1;
2818 *puDst = uResult;
2819 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2820}
2821
2822
2823IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2824{
2825 uint16_t uDst = *puDst;
2826 uint16_t uResult = uDst - 1;
2827 *puDst = uResult;
2828 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2829}
2830
2831
2832IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2833{
2834 uint8_t uDst = *puDst;
2835 uint8_t uResult = uDst - 1;
2836 *puDst = uResult;
2837 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2838}
2839
2840# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2841
2842
2843/*
2844 * NOT
2845 */
2846
2847IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2848{
2849 uint64_t uDst = *puDst;
2850 uint64_t uResult = ~uDst;
2851 *puDst = uResult;
2852 /* EFLAGS are not modified. */
2853 RT_NOREF_PV(pfEFlags);
2854}
2855
2856# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2857
2858IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2859{
2860 uint32_t uDst = *puDst;
2861 uint32_t uResult = ~uDst;
2862 *puDst = uResult;
2863 /* EFLAGS are not modified. */
2864 RT_NOREF_PV(pfEFlags);
2865}
2866
2867IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2868{
2869 uint16_t uDst = *puDst;
2870 uint16_t uResult = ~uDst;
2871 *puDst = uResult;
2872 /* EFLAGS are not modified. */
2873 RT_NOREF_PV(pfEFlags);
2874}
2875
2876IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2877{
2878 uint8_t uDst = *puDst;
2879 uint8_t uResult = ~uDst;
2880 *puDst = uResult;
2881 /* EFLAGS are not modified. */
2882 RT_NOREF_PV(pfEFlags);
2883}
2884
2885# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2886
2887
2888/*
2889 * NEG
2890 */
2891
/**
 * Recalculates the status bits (CF, PF, AF, ZF, SF, and OF) for a NEG
 * instruction and writes the updated flags back through @a a_pfEFlags.  This
 * is a statement macro, it does not yield a value.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value (evaluated several times).
 * @param   a_uDst          The original destination value (for the CF, AF and
 *                          OF calculation; evaluated several times).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflNew = *(a_pfEFlags) & ~(X86_EFL_STATUS_BITS | X86_EFL_CF); \
        /* CF is set unless the operand was zero. */ \
        fEflNew |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        fEflNew |= g_afParity[(a_uResult) & 0xff]; \
        /* AF: bit 4 differs between input and result exactly when bit 3 borrowed. */ \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF: the sign bit is set both before and after the negation. */ \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
2913
2914IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2915{
2916 uint64_t uDst = *puDst;
2917 uint64_t uResult = (uint64_t)0 - uDst;
2918 *puDst = uResult;
2919 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2920}
2921
2922# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2923
2924IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2925{
2926 uint32_t uDst = *puDst;
2927 uint32_t uResult = (uint32_t)0 - uDst;
2928 *puDst = uResult;
2929 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2930}
2931
2932
2933IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2934{
2935 uint16_t uDst = *puDst;
2936 uint16_t uResult = (uint16_t)0 - uDst;
2937 *puDst = uResult;
2938 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2939}
2940
2941
2942IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2943{
2944 uint8_t uDst = *puDst;
2945 uint8_t uResult = (uint8_t)0 - uDst;
2946 *puDst = uResult;
2947 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2948}
2949
2950# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2951
2952/*
2953 * Locked variants.
2954 */
2955
2956/** Emit a function for doing a locked unary operand operation. */
2957# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2958 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2959 uint32_t *pfEFlags)) \
2960 { \
2961 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2962 uint ## a_cBitsWidth ## _t uTmp; \
2963 uint32_t fEflTmp; \
2964 do \
2965 { \
2966 uTmp = uOld; \
2967 fEflTmp = *pfEFlags; \
2968 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2969 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2970 *pfEFlags = fEflTmp; \
2971 }
2972
2973EMIT_LOCKED_UNARY_OP(inc, 64)
2974EMIT_LOCKED_UNARY_OP(dec, 64)
2975EMIT_LOCKED_UNARY_OP(not, 64)
2976EMIT_LOCKED_UNARY_OP(neg, 64)
2977# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2978EMIT_LOCKED_UNARY_OP(inc, 32)
2979EMIT_LOCKED_UNARY_OP(dec, 32)
2980EMIT_LOCKED_UNARY_OP(not, 32)
2981EMIT_LOCKED_UNARY_OP(neg, 32)
2982
2983EMIT_LOCKED_UNARY_OP(inc, 16)
2984EMIT_LOCKED_UNARY_OP(dec, 16)
2985EMIT_LOCKED_UNARY_OP(not, 16)
2986EMIT_LOCKED_UNARY_OP(neg, 16)
2987
2988EMIT_LOCKED_UNARY_OP(inc, 8)
2989EMIT_LOCKED_UNARY_OP(dec, 8)
2990EMIT_LOCKED_UNARY_OP(not, 8)
2991EMIT_LOCKED_UNARY_OP(neg, 8)
2992# endif
2993
2994#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2995
2996
2997/*********************************************************************************************************************************
2998* Shifting and Rotating *
2999*********************************************************************************************************************************/
3000
3001/*
3002 * ROL
3003 */
3004#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
3005IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3006{ \
3007 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3008 if (cShift) \
3009 { \
3010 if (a_cBitsWidth < 32) \
3011 cShift &= a_cBitsWidth - 1; \
3012 a_uType const uDst = *puDst; \
3013 a_uType const uResult = a_fnHlp(uDst, cShift); \
3014 *puDst = uResult; \
3015 \
3016 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3017 it the same way as for 1 bit shifts. */ \
3018 AssertCompile(X86_EFL_CF_BIT == 0); \
3019 uint32_t fEfl = *pfEFlags; \
3020 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3021 uint32_t const fCarry = (uResult & X86_EFL_CF); \
3022 fEfl |= fCarry; \
3023 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3024 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
3025 else /* Intel 10980XE: According to the first sub-shift: */ \
3026 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3027 *pfEFlags = fEfl; \
3028 } \
3029}
3030
3031#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3032EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
3033#endif
3034EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
3035EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
3036
3037#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3038EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
3039#endif
3040EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
3041EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
3042
3043DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
3044{
3045 return (uValue << cShift) | (uValue >> (16 - cShift));
3046}
3047#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3048EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
3049#endif
3050EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
3051EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
3052
3053DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
3054{
3055 return (uValue << cShift) | (uValue >> (8 - cShift));
3056}
3057#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3058EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
3059#endif
3060EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
3061EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
3062
3063
3064/*
3065 * ROR
3066 */
3067#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
3068IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3069{ \
3070 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3071 if (cShift) \
3072 { \
3073 if (a_cBitsWidth < 32) \
3074 cShift &= a_cBitsWidth - 1; \
3075 a_uType const uDst = *puDst; \
3076 a_uType const uResult = a_fnHlp(uDst, cShift); \
3077 *puDst = uResult; \
3078 \
3079 /* Calc EFLAGS: */ \
3080 AssertCompile(X86_EFL_CF_BIT == 0); \
3081 uint32_t fEfl = *pfEFlags; \
3082 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3083 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
3084 fEfl |= fCarry; \
3085 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3086 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
3087 else /* Intel 10980XE: According to the first sub-shift: */ \
3088 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
3089 *pfEFlags = fEfl; \
3090 } \
3091}
3092
3093#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3094EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
3095#endif
3096EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
3097EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
3098
3099#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3100EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
3101#endif
3102EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
3103EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
3104
3105DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
3106{
3107 return (uValue >> cShift) | (uValue << (16 - cShift));
3108}
3109#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3110EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
3111#endif
3112EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
3113EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
3114
3115DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
3116{
3117 return (uValue >> cShift) | (uValue << (8 - cShift));
3118}
3119#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3120EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
3121#endif
3122EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
3123EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
3124
3125
3126/*
3127 * RCL
3128 */
3129#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3130IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3131{ \
3132 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3133 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3134 cShift %= a_cBitsWidth + 1; \
3135 if (cShift) \
3136 { \
3137 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3138 cShift %= a_cBitsWidth + 1; \
3139 a_uType const uDst = *puDst; \
3140 a_uType uResult = uDst << cShift; \
3141 if (cShift > 1) \
3142 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
3143 \
3144 AssertCompile(X86_EFL_CF_BIT == 0); \
3145 uint32_t fEfl = *pfEFlags; \
3146 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3147 uResult |= (a_uType)fInCarry << (cShift - 1); \
3148 \
3149 *puDst = uResult; \
3150 \
3151 /* Calc EFLAGS. */ \
3152 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3153 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3154 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
3155 fEfl |= fOutCarry; \
3156 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3157 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3158 else /* Intel 10980XE: According to the first sub-shift: */ \
3159 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3160 *pfEFlags = fEfl; \
3161 } \
3162}
3163
3164#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3165EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3166#endif
3167EMIT_RCL(64, uint64_t, _intel, 1)
3168EMIT_RCL(64, uint64_t, _amd, 0)
3169
3170#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3171EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3172#endif
3173EMIT_RCL(32, uint32_t, _intel, 1)
3174EMIT_RCL(32, uint32_t, _amd, 0)
3175
3176#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3177EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3178#endif
3179EMIT_RCL(16, uint16_t, _intel, 1)
3180EMIT_RCL(16, uint16_t, _amd, 0)
3181
3182#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3183EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3184#endif
3185EMIT_RCL(8, uint8_t, _intel, 1)
3186EMIT_RCL(8, uint8_t, _amd, 0)
3187
3188
3189/*
3190 * RCR
3191 */
3192#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3193IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3194{ \
3195 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3196 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3197 cShift %= a_cBitsWidth + 1; \
3198 if (cShift) \
3199 { \
3200 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3201 cShift %= a_cBitsWidth + 1; \
3202 a_uType const uDst = *puDst; \
3203 a_uType uResult = uDst >> cShift; \
3204 if (cShift > 1) \
3205 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3206 \
3207 AssertCompile(X86_EFL_CF_BIT == 0); \
3208 uint32_t fEfl = *pfEFlags; \
3209 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3210 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3211 *puDst = uResult; \
3212 \
3213 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3214 it the same way as for 1 bit shifts. */ \
3215 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3216 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3217 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3218 fEfl |= fOutCarry; \
3219 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3220 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3221 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3222 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3223 *pfEFlags = fEfl; \
3224 } \
3225}
3226
3227#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3228EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3229#endif
3230EMIT_RCR(64, uint64_t, _intel, 1)
3231EMIT_RCR(64, uint64_t, _amd, 0)
3232
3233#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3234EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3235#endif
3236EMIT_RCR(32, uint32_t, _intel, 1)
3237EMIT_RCR(32, uint32_t, _amd, 0)
3238
3239#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3240EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3241#endif
3242EMIT_RCR(16, uint16_t, _intel, 1)
3243EMIT_RCR(16, uint16_t, _amd, 0)
3244
3245#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3246EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3247#endif
3248EMIT_RCR(8, uint8_t, _intel, 1)
3249EMIT_RCR(8, uint8_t, _amd, 0)
3250
3251
3252/*
3253 * SHL
3254 */
3255#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3256IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3257{ \
3258 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3259 if (cShift) \
3260 { \
3261 a_uType const uDst = *puDst; \
3262 a_uType uResult = uDst << cShift; \
3263 *puDst = uResult; \
3264 \
3265 /* Calc EFLAGS. */ \
3266 AssertCompile(X86_EFL_CF_BIT == 0); \
3267 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3268 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3269 fEfl |= fCarry; \
3270 if (!a_fIntelFlags) \
3271 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3272 else \
3273 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3274 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3275 fEfl |= X86_EFL_CALC_ZF(uResult); \
3276 fEfl |= g_afParity[uResult & 0xff]; \
3277 if (!a_fIntelFlags) \
3278 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3279 *pfEFlags = fEfl; \
3280 } \
3281}
3282
3283#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3284EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3285#endif
3286EMIT_SHL(64, uint64_t, _intel, 1)
3287EMIT_SHL(64, uint64_t, _amd, 0)
3288
3289#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3290EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3291#endif
3292EMIT_SHL(32, uint32_t, _intel, 1)
3293EMIT_SHL(32, uint32_t, _amd, 0)
3294
3295#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3296EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3297#endif
3298EMIT_SHL(16, uint16_t, _intel, 1)
3299EMIT_SHL(16, uint16_t, _amd, 0)
3300
3301#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3302EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3303#endif
3304EMIT_SHL(8, uint8_t, _intel, 1)
3305EMIT_SHL(8, uint8_t, _amd, 0)
3306
3307
3308/*
3309 * SHR
3310 */
3311#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3312IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3313{ \
3314 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3315 if (cShift) \
3316 { \
3317 a_uType const uDst = *puDst; \
3318 a_uType uResult = uDst >> cShift; \
3319 *puDst = uResult; \
3320 \
3321 /* Calc EFLAGS. */ \
3322 AssertCompile(X86_EFL_CF_BIT == 0); \
3323 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3324 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3325 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3326 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3327 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3328 fEfl |= X86_EFL_CALC_ZF(uResult); \
3329 fEfl |= g_afParity[uResult & 0xff]; \
3330 if (!a_fIntelFlags) \
3331 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3332 *pfEFlags = fEfl; \
3333 } \
3334}
3335
3336#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3337EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3338#endif
3339EMIT_SHR(64, uint64_t, _intel, 1)
3340EMIT_SHR(64, uint64_t, _amd, 0)
3341
3342#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3343EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3344#endif
3345EMIT_SHR(32, uint32_t, _intel, 1)
3346EMIT_SHR(32, uint32_t, _amd, 0)
3347
3348#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3349EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3350#endif
3351EMIT_SHR(16, uint16_t, _intel, 1)
3352EMIT_SHR(16, uint16_t, _amd, 0)
3353
3354#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3355EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3356#endif
3357EMIT_SHR(8, uint8_t, _intel, 1)
3358EMIT_SHR(8, uint8_t, _amd, 0)
3359
3360
3361/*
3362 * SAR
3363 */
3364#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3365IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3366{ \
3367 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3368 if (cShift) \
3369 { \
3370 a_iType const iDst = (a_iType)*puDst; \
3371 a_uType uResult = iDst >> cShift; \
3372 *puDst = uResult; \
3373 \
3374 /* Calc EFLAGS. \
3375 Note! The OF flag is always zero because the result never differs from the input. */ \
3376 AssertCompile(X86_EFL_CF_BIT == 0); \
3377 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3378 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3379 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3380 fEfl |= X86_EFL_CALC_ZF(uResult); \
3381 fEfl |= g_afParity[uResult & 0xff]; \
3382 if (!a_fIntelFlags) \
3383 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3384 *pfEFlags = fEfl; \
3385 } \
3386}
3387
3388#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3389EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3390#endif
3391EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3392EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3393
3394#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3395EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3396#endif
3397EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3398EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3399
3400#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3401EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3402#endif
3403EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3404EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3405
3406#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3407EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3408#endif
3409EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3410EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3411
3412
3413/*
3414 * SHLD
3415 *
3416 * - CF is the last bit shifted out of puDst.
3417 * - AF is always cleared by Intel 10980XE.
3418 * - AF is always set by AMD 3990X.
3419 * - OF is set according to the first shift on Intel 10980XE, it seems.
3420 * - OF is set according to the last sub-shift on AMD 3990X.
3421 * - ZF, SF and PF are calculated according to the result by both vendors.
3422 *
3423 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3424 * pick either the source register or the destination register for input bits
3425 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3426 * intel has changed behaviour here several times. We implement what current
3427 * skylake based does for now, we can extend this later as needed.
3428 */
3429#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3430IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3431 uint32_t *pfEFlags)) \
3432{ \
3433 cShift &= a_cBitsWidth - 1; \
3434 if (cShift) \
3435 { \
3436 a_uType const uDst = *puDst; \
3437 a_uType uResult = uDst << cShift; \
3438 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3439 *puDst = uResult; \
3440 \
3441 /* CALC EFLAGS: */ \
3442 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3443 if (a_fIntelFlags) \
3444 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3445 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3446 else \
3447 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3448 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3449 fEfl |= X86_EFL_AF; \
3450 } \
3451 AssertCompile(X86_EFL_CF_BIT == 0); \
3452 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3453 fEfl |= g_afParity[uResult & 0xff]; \
3454 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3455 fEfl |= X86_EFL_CALC_ZF(uResult); \
3456 *pfEFlags = fEfl; \
3457 } \
3458}
3459
3460#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3461EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3462#endif
3463EMIT_SHLD(64, uint64_t, _intel, 1)
3464EMIT_SHLD(64, uint64_t, _amd, 0)
3465
3466#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3467EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3468#endif
3469EMIT_SHLD(32, uint32_t, _intel, 1)
3470EMIT_SHLD(32, uint32_t, _amd, 0)
3471
3472#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3473IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3474{ \
3475 cShift &= 31; \
3476 if (cShift) \
3477 { \
3478 uint16_t const uDst = *puDst; \
3479 uint64_t const uTmp = a_fIntelFlags \
3480 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3481 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3482 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3483 *puDst = uResult; \
3484 \
3485 /* CALC EFLAGS: */ \
3486 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3487 AssertCompile(X86_EFL_CF_BIT == 0); \
3488 if (a_fIntelFlags) \
3489 { \
3490 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3491 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3492 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3493 } \
3494 else \
3495 { \
3496 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3497 if (cShift < 16) \
3498 { \
3499 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3500 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3501 } \
3502 else \
3503 { \
3504 if (cShift == 16) \
3505 fEfl |= uDst & X86_EFL_CF; \
3506 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3507 } \
3508 fEfl |= X86_EFL_AF; \
3509 } \
3510 fEfl |= g_afParity[uResult & 0xff]; \
3511 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3512 fEfl |= X86_EFL_CALC_ZF(uResult); \
3513 *pfEFlags = fEfl; \
3514 } \
3515}
3516
3517#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3518EMIT_SHLD_16(RT_NOTHING, 1)
3519#endif
3520EMIT_SHLD_16(_intel, 1)
3521EMIT_SHLD_16(_amd, 0)
3522
3523
3524/*
3525 * SHRD
3526 *
3527 * EFLAGS behaviour seems to be the same as with SHLD:
3528 * - CF is the last bit shifted out of puDst.
3529 * - AF is always cleared by Intel 10980XE.
3530 * - AF is always set by AMD 3990X.
3531 * - OF is set according to the first shift on Intel 10980XE, it seems.
3532 * - OF is set according to the last sub-shift on AMD 3990X.
3533 * - ZF, SF and PF are calculated according to the result by both vendors.
3534 *
3535 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3536 * pick either the source register or the destination register for input bits
3537 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
 * Intel has changed behaviour here several times.  We implement what current
 * Skylake-based CPUs do for now; this can be extended later as needed.
3540 */
3541#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3542IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3543{ \
3544 cShift &= a_cBitsWidth - 1; \
3545 if (cShift) \
3546 { \
3547 a_uType const uDst = *puDst; \
3548 a_uType uResult = uDst >> cShift; \
3549 uResult |= uSrc << (a_cBitsWidth - cShift); \
3550 *puDst = uResult; \
3551 \
3552 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3553 AssertCompile(X86_EFL_CF_BIT == 0); \
3554 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3555 if (a_fIntelFlags) \
3556 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3557 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3558 else \
3559 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3560 if (cShift > 1) /* Set according to last shift. */ \
3561 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3562 else \
3563 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3564 fEfl |= X86_EFL_AF; \
3565 } \
3566 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3567 fEfl |= X86_EFL_CALC_ZF(uResult); \
3568 fEfl |= g_afParity[uResult & 0xff]; \
3569 *pfEFlags = fEfl; \
3570 } \
3571}
3572
3573#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3574EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3575#endif
3576EMIT_SHRD(64, uint64_t, _intel, 1)
3577EMIT_SHRD(64, uint64_t, _amd, 0)
3578
3579#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3580EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3581#endif
3582EMIT_SHRD(32, uint32_t, _intel, 1)
3583EMIT_SHRD(32, uint32_t, _amd, 0)
3584
3585#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3586IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3587{ \
3588 cShift &= 31; \
3589 if (cShift) \
3590 { \
3591 uint16_t const uDst = *puDst; \
3592 uint64_t const uTmp = a_fIntelFlags \
3593 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3594 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3595 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3596 *puDst = uResult; \
3597 \
3598 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3599 AssertCompile(X86_EFL_CF_BIT == 0); \
3600 if (a_fIntelFlags) \
3601 { \
3602 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3603 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3604 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3605 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3606 } \
3607 else \
3608 { \
3609 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3610 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3611 /* AMD 3990X: Set according to last shift. AF always set. */ \
3612 if (cShift > 1) /* Set according to last shift. */ \
3613 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3614 else \
3615 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3616 fEfl |= X86_EFL_AF; \
3617 } \
3618 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3619 fEfl |= X86_EFL_CALC_ZF(uResult); \
3620 fEfl |= g_afParity[uResult & 0xff]; \
3621 *pfEFlags = fEfl; \
3622 } \
3623}
3624
3625#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3626EMIT_SHRD_16(RT_NOTHING, 1)
3627#endif
3628EMIT_SHRD_16(_intel, 1)
3629EMIT_SHRD_16(_amd, 0)
3630
3631
3632/*
3633 * RORX (BMI2)
3634 */
3635#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3636IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3637{ \
3638 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3639}
3640
3641#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3642EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3643#endif
3644#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3645EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3646#endif
3647
3648
3649/*
3650 * SHLX (BMI2)
3651 */
3652#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3653IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3654{ \
3655 cShift &= a_cBitsWidth - 1; \
3656 *puDst = uSrc << cShift; \
3657}
3658
3659#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3660EMIT_SHLX(64, uint64_t, RT_NOTHING)
3661EMIT_SHLX(64, uint64_t, _fallback)
3662#endif
3663#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3664EMIT_SHLX(32, uint32_t, RT_NOTHING)
3665EMIT_SHLX(32, uint32_t, _fallback)
3666#endif
3667
3668
3669/*
3670 * SHRX (BMI2)
3671 */
3672#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3673IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3674{ \
3675 cShift &= a_cBitsWidth - 1; \
3676 *puDst = uSrc >> cShift; \
3677}
3678
3679#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3680EMIT_SHRX(64, uint64_t, RT_NOTHING)
3681EMIT_SHRX(64, uint64_t, _fallback)
3682#endif
3683#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3684EMIT_SHRX(32, uint32_t, RT_NOTHING)
3685EMIT_SHRX(32, uint32_t, _fallback)
3686#endif
3687
3688
3689/*
3690 * SARX (BMI2)
3691 */
3692#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3693IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3694{ \
3695 cShift &= a_cBitsWidth - 1; \
3696 *puDst = (a_iType)uSrc >> cShift; \
3697}
3698
3699#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3700EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3701EMIT_SARX(64, uint64_t, int64_t, _fallback)
3702#endif
3703#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3704EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3705EMIT_SARX(32, uint32_t, int32_t, _fallback)
3706#endif
3707
3708
3709/*
3710 * PDEP (BMI2)
3711 */
3712#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3713IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3714{ \
3715 a_uType uResult = 0; \
3716 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3717 if (fMask & ((a_uType)1 << iMaskBit)) \
3718 { \
3719 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3720 iBit++; \
3721 } \
3722 *puDst = uResult; \
3723}
3724
3725#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3726EMIT_PDEP(64, uint64_t, RT_NOTHING)
3727#endif
3728EMIT_PDEP(64, uint64_t, _fallback)
3729#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3730EMIT_PDEP(32, uint32_t, RT_NOTHING)
3731#endif
3732EMIT_PDEP(32, uint32_t, _fallback)
3733
3734/*
3735 * PEXT (BMI2)
3736 */
3737#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3738IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3739{ \
3740 a_uType uResult = 0; \
3741 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3742 if (fMask & ((a_uType)1 << iMaskBit)) \
3743 { \
3744 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3745 iBit++; \
3746 } \
3747 *puDst = uResult; \
3748}
3749
3750#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3751EMIT_PEXT(64, uint64_t, RT_NOTHING)
3752#endif
3753EMIT_PEXT(64, uint64_t, _fallback)
3754#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3755EMIT_PEXT(32, uint32_t, RT_NOTHING)
3756#endif
3757EMIT_PEXT(32, uint32_t, _fallback)
3758
3759
3760#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3761
3762# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3763/*
3764 * BSWAP
3765 */
3766
3767IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3768{
3769 *puDst = ASMByteSwapU64(*puDst);
3770}
3771
3772
3773IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3774{
3775 *puDst = ASMByteSwapU32(*puDst);
3776}
3777
3778
3779/* Note! undocument, so 32-bit arg */
3780IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3781{
3782#if 0
3783 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3784#else
3785 /* This is the behaviour AMD 3990x (64-bit mode): */
3786 *(uint16_t *)puDst = 0;
3787#endif
3788}
3789
3790# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3791
3792
3793
3794# if defined(IEM_WITHOUT_ASSEMBLY)
3795
3796/*
3797 * LFENCE, SFENCE & MFENCE.
3798 */
3799
3800IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3801{
3802 ASMReadFence();
3803}
3804
3805
3806IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3807{
3808 ASMWriteFence();
3809}
3810
3811
3812IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3813{
3814 ASMMemoryFence();
3815}
3816
3817
3818# ifndef RT_ARCH_ARM64
3819IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3820{
3821 ASMMemoryFence();
3822}
3823# endif
3824
3825# endif
3826
3827#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3828
3829
3830IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3831{
3832 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3833 {
3834 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3835 *pu16Dst |= u16Src & X86_SEL_RPL;
3836
3837 *pfEFlags |= X86_EFL_ZF;
3838 }
3839 else
3840 *pfEFlags &= ~X86_EFL_ZF;
3841}
3842
3843
3844#if defined(IEM_WITHOUT_ASSEMBLY)
3845
3846/*********************************************************************************************************************************
3847* x87 FPU Loads *
3848*********************************************************************************************************************************/
3849
3850IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3851{
3852 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3853 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3854 {
3855 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3856 pFpuRes->r80Result.sj64.fInteger = 1;
3857 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3858 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3859 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3860 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3861 }
3862 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3863 {
3864 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3865 pFpuRes->r80Result.s.uExponent = 0;
3866 pFpuRes->r80Result.s.uMantissa = 0;
3867 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3868 }
3869 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3870 {
3871 /* Subnormal values gets normalized. */
3872 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3873 pFpuRes->r80Result.sj64.fInteger = 1;
3874 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3875 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3876 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3877 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3878 pFpuRes->FSW |= X86_FSW_DE;
3879 if (!(pFpuState->FCW & X86_FCW_DM))
3880 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3881 }
3882 else if (RTFLOAT32U_IS_INF(pr32Val))
3883 {
3884 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3885 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3886 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3887 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3888 }
3889 else
3890 {
3891 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3892 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3893 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3894 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3895 pFpuRes->r80Result.sj64.fInteger = 1;
3896 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3897 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3898 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3899 {
3900 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3901 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3902 pFpuRes->FSW |= X86_FSW_IE;
3903
3904 if (!(pFpuState->FCW & X86_FCW_IM))
3905 {
3906 /* The value is not pushed. */
3907 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3908 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3909 pFpuRes->r80Result.au64[0] = 0;
3910 pFpuRes->r80Result.au16[4] = 0;
3911 }
3912 }
3913 else
3914 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3915 }
3916}
3917
3918
3919IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3920{
3921 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3922 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3923 {
3924 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3925 pFpuRes->r80Result.sj64.fInteger = 1;
3926 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3927 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3928 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3929 }
3930 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3931 {
3932 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3933 pFpuRes->r80Result.s.uExponent = 0;
3934 pFpuRes->r80Result.s.uMantissa = 0;
3935 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3936 }
3937 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3938 {
3939 /* Subnormal values gets normalized. */
3940 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3941 pFpuRes->r80Result.sj64.fInteger = 1;
3942 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3943 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3944 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3945 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3946 pFpuRes->FSW |= X86_FSW_DE;
3947 if (!(pFpuState->FCW & X86_FCW_DM))
3948 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3949 }
3950 else if (RTFLOAT64U_IS_INF(pr64Val))
3951 {
3952 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3953 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3954 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3955 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3956 }
3957 else
3958 {
3959 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3960 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3961 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3962 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3963 pFpuRes->r80Result.sj64.fInteger = 1;
3964 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3965 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3966 {
3967 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3968 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3969 pFpuRes->FSW |= X86_FSW_IE;
3970
3971 if (!(pFpuState->FCW & X86_FCW_IM))
3972 {
3973 /* The value is not pushed. */
3974 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3975 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3976 pFpuRes->r80Result.au64[0] = 0;
3977 pFpuRes->r80Result.au16[4] = 0;
3978 }
3979 }
3980 else
3981 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3982 }
3983}
3984
3985
3986IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3987{
3988 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3989 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3990 /* Raises no exceptions. */
3991 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3992}
3993
3994
3995IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3996{
3997 pFpuRes->r80Result.sj64.fSign = 0;
3998 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3999 pFpuRes->r80Result.sj64.fInteger = 1;
4000 pFpuRes->r80Result.sj64.uFraction = 0;
4001
4002 /*
4003 * FPU status word:
4004 * - TOP is irrelevant, but we must match x86 assembly version.
4005 * - C1 is always cleared as we don't have any stack overflows.
4006 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4007 */
4008 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4009}
4010
4011
4012IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4013{
4014 pFpuRes->r80Result.sj64.fSign = 0;
4015 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4016 pFpuRes->r80Result.sj64.fInteger = 1;
4017 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4018 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4019 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
4020 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4021}
4022
4023
4024IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4025{
4026 pFpuRes->r80Result.sj64.fSign = 0;
4027 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4028 pFpuRes->r80Result.sj64.fInteger = 1;
4029 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
4030 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
4031 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4032}
4033
4034
4035IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4036{
4037 pFpuRes->r80Result.sj64.fSign = 0;
4038 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
4039 pFpuRes->r80Result.sj64.fInteger = 1;
4040 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4041 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4042 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
4043 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4044}
4045
4046
4047IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4048{
4049 pFpuRes->r80Result.sj64.fSign = 0;
4050 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
4051 pFpuRes->r80Result.sj64.fInteger = 1;
4052 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4053 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4054 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
4055 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4056}
4057
4058
4059IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4060{
4061 pFpuRes->r80Result.sj64.fSign = 0;
4062 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4063 pFpuRes->r80Result.sj64.fInteger = 1;
4064 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4065 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4066 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
4067 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4068}
4069
4070
4071IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4072{
4073 pFpuRes->r80Result.s.fSign = 0;
4074 pFpuRes->r80Result.s.uExponent = 0;
4075 pFpuRes->r80Result.s.uMantissa = 0;
4076 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4077}
4078
4079#define EMIT_FILD(a_cBits) \
4080IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
4081 int ## a_cBits ## _t const *piVal)) \
4082{ \
4083 int ## a_cBits ## _t iVal = *piVal; \
4084 if (iVal == 0) \
4085 { \
4086 pFpuRes->r80Result.s.fSign = 0; \
4087 pFpuRes->r80Result.s.uExponent = 0; \
4088 pFpuRes->r80Result.s.uMantissa = 0; \
4089 } \
4090 else \
4091 { \
4092 if (iVal > 0) \
4093 pFpuRes->r80Result.s.fSign = 0; \
4094 else \
4095 { \
4096 pFpuRes->r80Result.s.fSign = 1; \
4097 iVal = -iVal; \
4098 } \
4099 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
4100 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
4101 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
4102 } \
4103 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
4104}
4105EMIT_FILD(16)
4106EMIT_FILD(32)
4107EMIT_FILD(64)
4108
4109
4110IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
4111{
4112 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4113 if ( pd80Val->s.abPairs[0] == 0
4114 && pd80Val->s.abPairs[1] == 0
4115 && pd80Val->s.abPairs[2] == 0
4116 && pd80Val->s.abPairs[3] == 0
4117 && pd80Val->s.abPairs[4] == 0
4118 && pd80Val->s.abPairs[5] == 0
4119 && pd80Val->s.abPairs[6] == 0
4120 && pd80Val->s.abPairs[7] == 0
4121 && pd80Val->s.abPairs[8] == 0)
4122 {
4123 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4124 pFpuRes->r80Result.s.uExponent = 0;
4125 pFpuRes->r80Result.s.uMantissa = 0;
4126 }
4127 else
4128 {
4129 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4130
4131 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
4132 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
4133 cPairs--;
4134
4135 uint64_t uVal = 0;
4136 uint64_t uFactor = 1;
4137 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
4138 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
4139 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
4140
4141 unsigned const cBits = ASMBitLastSetU64(uVal);
4142 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
4143 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
4144 }
4145}
4146
4147
4148/*********************************************************************************************************************************
4149* x87 FPU Stores *
4150*********************************************************************************************************************************/
4151
4152/**
4153 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4154 *
4155 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4156 *
4157 * @returns Updated FPU status word value.
4158 * @param fSignIn Incoming sign indicator.
4159 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4160 * @param iExponentIn Unbiased exponent.
4161 * @param fFcw The FPU control word.
4162 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4163 * @param pr32Dst Where to return the output value, if one should be
4164 * returned.
4165 *
4166 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4167 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4168 */
4169static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4170 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4171{
4172 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4173 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4174 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4175 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4176 ? fRoundingOffMask
4177 : 0;
4178 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4179
4180 /*
4181 * Deal with potential overflows/underflows first, optimizing for none.
4182 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4183 */
4184 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4185 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4186 { /* likely? */ }
4187 /*
4188 * Underflow if the exponent zero or negative. This is attempted mapped
4189 * to a subnormal number when possible, with some additional trickery ofc.
4190 */
4191 else if (iExponentOut <= 0)
4192 {
4193 bool const fIsTiny = iExponentOut < 0
4194 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4195 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4196 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4197 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4198
4199 if (iExponentOut <= 0)
4200 {
4201 uMantissaIn = iExponentOut <= -63
4202 ? uMantissaIn != 0
4203 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4204 fRoundedOff = uMantissaIn & fRoundingOffMask;
4205 if (fRoundedOff && fIsTiny)
4206 fFsw |= X86_FSW_UE;
4207 iExponentOut = 0;
4208 }
4209 }
4210 /*
4211 * Overflow if at or above max exponent value or if we will reach max
4212 * when rounding. Will return +/-zero or +/-max value depending on
4213 * whether we're rounding or not.
4214 */
4215 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4216 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4217 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4218 {
4219 fFsw |= X86_FSW_OE;
4220 if (!(fFcw & X86_FCW_OM))
4221 return fFsw | X86_FSW_ES | X86_FSW_B;
4222 fFsw |= X86_FSW_PE;
4223 if (uRoundingAdd)
4224 fFsw |= X86_FSW_C1;
4225 if (!(fFcw & X86_FCW_PM))
4226 fFsw |= X86_FSW_ES | X86_FSW_B;
4227
4228 pr32Dst->s.fSign = fSignIn;
4229 if (uRoundingAdd)
4230 { /* Zero */
4231 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4232 pr32Dst->s.uFraction = 0;
4233 }
4234 else
4235 { /* Max */
4236 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4237 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4238 }
4239 return fFsw;
4240 }
4241
4242 /*
4243 * Normal or subnormal number.
4244 */
4245 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4246 uint64_t uMantissaOut = uMantissaIn;
4247 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4248 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4249 || fRoundedOff != uRoundingAdd)
4250 {
4251 uMantissaOut = uMantissaIn + uRoundingAdd;
4252 if (uMantissaOut >= uMantissaIn)
4253 { /* likely */ }
4254 else
4255 {
4256 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4257 iExponentOut++;
4258 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4259 fFsw |= X86_FSW_C1;
4260 }
4261 }
4262 else
4263 uMantissaOut = uMantissaIn;
4264
4265 /* Truncate the mantissa and set the return value. */
4266 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4267
4268 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4269 pr32Dst->s.uExponent = iExponentOut;
4270 pr32Dst->s.fSign = fSignIn;
4271
4272 /* Set status flags realted to rounding. */
4273 if (fRoundedOff)
4274 {
4275 fFsw |= X86_FSW_PE;
4276 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4277 fFsw |= X86_FSW_C1;
4278 if (!(fFcw & X86_FCW_PM))
4279 fFsw |= X86_FSW_ES | X86_FSW_B;
4280 }
4281
4282 return fFsw;
4283}
4284
4285
4286/**
4287 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4288 */
4289IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4290 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4291{
4292 uint16_t const fFcw = pFpuState->FCW;
4293 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4294 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4295 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4296 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4297 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4298 {
4299 pr32Dst->s.fSign = pr80Src->s.fSign;
4300 pr32Dst->s.uExponent = 0;
4301 pr32Dst->s.uFraction = 0;
4302 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4303 }
4304 else if (RTFLOAT80U_IS_INF(pr80Src))
4305 {
4306 pr32Dst->s.fSign = pr80Src->s.fSign;
4307 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4308 pr32Dst->s.uFraction = 0;
4309 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4310 }
4311 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4312 {
4313 /* Mapped to +/-QNaN */
4314 pr32Dst->s.fSign = pr80Src->s.fSign;
4315 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4316 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4317 }
4318 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4319 {
4320 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4321 if (fFcw & X86_FCW_IM)
4322 {
4323 pr32Dst->s.fSign = 1;
4324 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4325 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4326 fFsw |= X86_FSW_IE;
4327 }
4328 else
4329 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4330 }
4331 else if (RTFLOAT80U_IS_NAN(pr80Src))
4332 {
4333 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4334 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4335 {
4336 pr32Dst->s.fSign = pr80Src->s.fSign;
4337 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4338 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4339 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4340 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4341 fFsw |= X86_FSW_IE;
4342 }
4343 else
4344 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4345 }
4346 else
4347 {
4348 /* Denormal values causes both an underflow and precision exception. */
4349 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4350 if (fFcw & X86_FCW_UM)
4351 {
4352 pr32Dst->s.fSign = pr80Src->s.fSign;
4353 pr32Dst->s.uExponent = 0;
4354 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4355 {
4356 pr32Dst->s.uFraction = 1;
4357 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4358 if (!(fFcw & X86_FCW_PM))
4359 fFsw |= X86_FSW_ES | X86_FSW_B;
4360 }
4361 else
4362 {
4363 pr32Dst->s.uFraction = 0;
4364 fFsw |= X86_FSW_UE | X86_FSW_PE;
4365 if (!(fFcw & X86_FCW_PM))
4366 fFsw |= X86_FSW_ES | X86_FSW_B;
4367 }
4368 }
4369 else
4370 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4371 }
4372 *pu16FSW = fFsw;
4373}
4374
4375
4376/**
4377 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4378 *
4379 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4380 *
4381 * @returns Updated FPU status word value.
4382 * @param fSignIn Incoming sign indicator.
4383 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4384 * @param iExponentIn Unbiased exponent.
4385 * @param fFcw The FPU control word.
4386 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4387 * @param pr64Dst Where to return the output value, if one should be
4388 * returned.
4389 *
4390 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4391 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4392 */
4393static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4394 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4395{
4396 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4397 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4398 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4399 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4400 ? fRoundingOffMask
4401 : 0;
4402 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4403
4404 /*
4405 * Deal with potential overflows/underflows first, optimizing for none.
4406 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4407 */
4408 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4409 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4410 { /* likely? */ }
4411 /*
4412 * Underflow if the exponent zero or negative. This is attempted mapped
4413 * to a subnormal number when possible, with some additional trickery ofc.
4414 */
4415 else if (iExponentOut <= 0)
4416 {
4417 bool const fIsTiny = iExponentOut < 0
4418 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4419 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4420 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4421 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4422
4423 if (iExponentOut <= 0)
4424 {
4425 uMantissaIn = iExponentOut <= -63
4426 ? uMantissaIn != 0
4427 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4428 fRoundedOff = uMantissaIn & fRoundingOffMask;
4429 if (fRoundedOff && fIsTiny)
4430 fFsw |= X86_FSW_UE;
4431 iExponentOut = 0;
4432 }
4433 }
4434 /*
4435 * Overflow if at or above max exponent value or if we will reach max
4436 * when rounding. Will return +/-zero or +/-max value depending on
4437 * whether we're rounding or not.
4438 */
4439 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4440 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4441 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4442 {
4443 fFsw |= X86_FSW_OE;
4444 if (!(fFcw & X86_FCW_OM))
4445 return fFsw | X86_FSW_ES | X86_FSW_B;
4446 fFsw |= X86_FSW_PE;
4447 if (uRoundingAdd)
4448 fFsw |= X86_FSW_C1;
4449 if (!(fFcw & X86_FCW_PM))
4450 fFsw |= X86_FSW_ES | X86_FSW_B;
4451
4452 pr64Dst->s64.fSign = fSignIn;
4453 if (uRoundingAdd)
4454 { /* Zero */
4455 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4456 pr64Dst->s64.uFraction = 0;
4457 }
4458 else
4459 { /* Max */
4460 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4461 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4462 }
4463 return fFsw;
4464 }
4465
4466 /*
4467 * Normal or subnormal number.
4468 */
4469 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4470 uint64_t uMantissaOut = uMantissaIn;
4471 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4472 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4473 || fRoundedOff != uRoundingAdd)
4474 {
4475 uMantissaOut = uMantissaIn + uRoundingAdd;
4476 if (uMantissaOut >= uMantissaIn)
4477 { /* likely */ }
4478 else
4479 {
4480 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4481 iExponentOut++;
4482 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4483 fFsw |= X86_FSW_C1;
4484 }
4485 }
4486 else
4487 uMantissaOut = uMantissaIn;
4488
4489 /* Truncate the mantissa and set the return value. */
4490 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4491
4492 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4493 pr64Dst->s64.uExponent = iExponentOut;
4494 pr64Dst->s64.fSign = fSignIn;
4495
4496 /* Set status flags realted to rounding. */
4497 if (fRoundedOff)
4498 {
4499 fFsw |= X86_FSW_PE;
4500 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4501 fFsw |= X86_FSW_C1;
4502 if (!(fFcw & X86_FCW_PM))
4503 fFsw |= X86_FSW_ES | X86_FSW_B;
4504 }
4505
4506 return fFsw;
4507}
4508
4509
4510/**
4511 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4512 */
4513IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4514 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4515{
4516 uint16_t const fFcw = pFpuState->FCW;
4517 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4518 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4519 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4520 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4521 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4522 {
4523 pr64Dst->s64.fSign = pr80Src->s.fSign;
4524 pr64Dst->s64.uExponent = 0;
4525 pr64Dst->s64.uFraction = 0;
4526 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4527 }
4528 else if (RTFLOAT80U_IS_INF(pr80Src))
4529 {
4530 pr64Dst->s64.fSign = pr80Src->s.fSign;
4531 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4532 pr64Dst->s64.uFraction = 0;
4533 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4534 }
4535 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4536 {
4537 /* Mapped to +/-QNaN */
4538 pr64Dst->s64.fSign = pr80Src->s.fSign;
4539 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4540 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4541 }
4542 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4543 {
4544 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4545 if (fFcw & X86_FCW_IM)
4546 {
4547 pr64Dst->s64.fSign = 1;
4548 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4549 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4550 fFsw |= X86_FSW_IE;
4551 }
4552 else
4553 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4554 }
4555 else if (RTFLOAT80U_IS_NAN(pr80Src))
4556 {
4557 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4558 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4559 {
4560 pr64Dst->s64.fSign = pr80Src->s.fSign;
4561 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4562 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4563 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4564 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4565 fFsw |= X86_FSW_IE;
4566 }
4567 else
4568 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4569 }
4570 else
4571 {
4572 /* Denormal values causes both an underflow and precision exception. */
4573 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4574 if (fFcw & X86_FCW_UM)
4575 {
4576 pr64Dst->s64.fSign = pr80Src->s.fSign;
4577 pr64Dst->s64.uExponent = 0;
4578 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4579 {
4580 pr64Dst->s64.uFraction = 1;
4581 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4582 if (!(fFcw & X86_FCW_PM))
4583 fFsw |= X86_FSW_ES | X86_FSW_B;
4584 }
4585 else
4586 {
4587 pr64Dst->s64.uFraction = 0;
4588 fFsw |= X86_FSW_UE | X86_FSW_PE;
4589 if (!(fFcw & X86_FCW_PM))
4590 fFsw |= X86_FSW_ES | X86_FSW_B;
4591 }
4592 }
4593 else
4594 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4595 }
4596 *pu16FSW = fFsw;
4597}
4598
4599
4600IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4601 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4602{
4603 /*
4604 * FPU status word:
4605 * - TOP is irrelevant, but we must match x86 assembly version (0).
4606 * - C1 is always cleared as we don't have any stack overflows.
4607 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4608 */
4609 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4610 *pr80Dst = *pr80Src;
4611}
4612
4613
4614/*
4615 *
4616 * Mantissa:
4617 * 63 56 48 40 32 24 16 8 0
4618 * v v v v v v v v v
4619 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4620 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4621 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4622 *
4623 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4624 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4625 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4626 * where we'll drop off all but bit 63.
4627 */
4628#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4629IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4630 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4631{ \
4632 uint16_t const fFcw = pFpuState->FCW; \
4633 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4634 bool const fSignIn = pr80Val->s.fSign; \
4635 \
4636 /* \
4637 * Deal with normal numbers first. \
4638 */ \
4639 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4640 { \
4641 uint64_t uMantissa = pr80Val->s.uMantissa; \
4642 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4643 \
4644 if ((uint32_t)iExponent <= a_cBits - 2) \
4645 { \
4646 unsigned const cShiftOff = 63 - iExponent; \
4647 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4648 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4649 ? RT_BIT_64(cShiftOff - 1) \
4650 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4651 ? fRoundingOffMask \
4652 : 0; \
4653 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4654 \
4655 uMantissa >>= cShiftOff; \
4656 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4657 uMantissa += uRounding; \
4658 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4659 { \
4660 if (fRoundedOff) \
4661 { \
4662 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4663 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4664 else if (uRounding) \
4665 fFsw |= X86_FSW_C1; \
4666 fFsw |= X86_FSW_PE; \
4667 if (!(fFcw & X86_FCW_PM)) \
4668 fFsw |= X86_FSW_ES | X86_FSW_B; \
4669 } \
4670 \
4671 if (!fSignIn) \
4672 *piDst = (a_iType)uMantissa; \
4673 else \
4674 *piDst = -(a_iType)uMantissa; \
4675 } \
4676 else \
4677 { \
4678 /* overflowed after rounding. */ \
4679 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4680 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4681 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4682 \
4683 /* Special case for the integer minimum value. */ \
4684 if (fSignIn) \
4685 { \
4686 *piDst = a_iTypeMin; \
4687 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4688 if (!(fFcw & X86_FCW_PM)) \
4689 fFsw |= X86_FSW_ES | X86_FSW_B; \
4690 } \
4691 else \
4692 { \
4693 fFsw |= X86_FSW_IE; \
4694 if (fFcw & X86_FCW_IM) \
4695 *piDst = a_iTypeMin; \
4696 else \
4697 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4698 } \
4699 } \
4700 } \
4701 /* \
4702 * Tiny sub-zero numbers. \
4703 */ \
4704 else if (iExponent < 0) \
4705 { \
4706 if (!fSignIn) \
4707 { \
4708 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4709 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4710 { \
4711 *piDst = 1; \
4712 fFsw |= X86_FSW_C1; \
4713 } \
4714 else \
4715 *piDst = 0; \
4716 } \
4717 else \
4718 { \
4719 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4720 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4721 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4722 *piDst = 0; \
4723 else \
4724 { \
4725 *piDst = -1; \
4726 fFsw |= X86_FSW_C1; \
4727 } \
4728 } \
4729 fFsw |= X86_FSW_PE; \
4730 if (!(fFcw & X86_FCW_PM)) \
4731 fFsw |= X86_FSW_ES | X86_FSW_B; \
4732 } \
4733 /* \
4734 * Special MIN case. \
4735 */ \
4736 else if ( fSignIn && iExponent == a_cBits - 1 \
4737 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4738 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4739 : uMantissa == RT_BIT_64(63))) \
4740 { \
4741 *piDst = a_iTypeMin; \
4742 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4743 { \
4744 fFsw |= X86_FSW_PE; \
4745 if (!(fFcw & X86_FCW_PM)) \
4746 fFsw |= X86_FSW_ES | X86_FSW_B; \
4747 } \
4748 } \
4749 /* \
4750 * Too large/small number outside the target integer range. \
4751 */ \
4752 else \
4753 { \
4754 fFsw |= X86_FSW_IE; \
4755 if (fFcw & X86_FCW_IM) \
4756 *piDst = a_iTypeIndefinite; \
4757 else \
4758 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4759 } \
4760 } \
4761 /* \
4762 * Map both +0 and -0 to integer zero (signless/+). \
4763 */ \
4764 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4765 *piDst = 0; \
4766 /* \
4767 * Denormals are just really tiny sub-zero numbers that are either rounded \
4768 * to zero, 1 or -1 depending on sign and rounding control. \
4769 */ \
4770 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4771 { \
4772 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4773 *piDst = 0; \
4774 else \
4775 { \
4776 *piDst = fSignIn ? -1 : 1; \
4777 fFsw |= X86_FSW_C1; \
4778 } \
4779 fFsw |= X86_FSW_PE; \
4780 if (!(fFcw & X86_FCW_PM)) \
4781 fFsw |= X86_FSW_ES | X86_FSW_B; \
4782 } \
4783 /* \
4784 * All other special values are considered invalid arguments and result \
4785 * in an IE exception and indefinite value if masked. \
4786 */ \
4787 else \
4788 { \
4789 fFsw |= X86_FSW_IE; \
4790 if (fFcw & X86_FCW_IM) \
4791 *piDst = a_iTypeIndefinite; \
4792 else \
4793 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4794 } \
4795 *pu16FSW = fFsw; \
4796}
4797EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4798EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4799EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4800
4801#endif /*IEM_WITHOUT_ASSEMBLY */
4802
4803
4804/*
4805 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4806 *
4807 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4808 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4809 * thus the @a a_cBitsIn.
4810 */
4811#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4812IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4813 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4814{ \
4815 uint16_t const fFcw = pFpuState->FCW; \
4816 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4817 bool const fSignIn = pr80Val->s.fSign; \
4818 \
4819 /* \
4820 * Deal with normal numbers first. \
4821 */ \
4822 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4823 { \
4824 uint64_t uMantissa = pr80Val->s.uMantissa; \
4825 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4826 \
4827 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4828 { \
4829 unsigned const cShiftOff = 63 - iExponent; \
4830 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4831 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4832 uMantissa >>= cShiftOff; \
4833 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4834 if (!fSignIn) \
4835 *piDst = (a_iType)uMantissa; \
4836 else \
4837 *piDst = -(a_iType)uMantissa; \
4838 \
4839 if (fRoundedOff) \
4840 { \
4841 fFsw |= X86_FSW_PE; \
4842 if (!(fFcw & X86_FCW_PM)) \
4843 fFsw |= X86_FSW_ES | X86_FSW_B; \
4844 } \
4845 } \
4846 /* \
4847 * Tiny sub-zero numbers. \
4848 */ \
4849 else if (iExponent < 0) \
4850 { \
4851 *piDst = 0; \
4852 fFsw |= X86_FSW_PE; \
4853 if (!(fFcw & X86_FCW_PM)) \
4854 fFsw |= X86_FSW_ES | X86_FSW_B; \
4855 } \
4856 /* \
4857 * Special MIN case. \
4858 */ \
4859 else if ( fSignIn && iExponent == a_cBits - 1 \
4860 && (a_cBits < 64 \
4861 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4862 : uMantissa == RT_BIT_64(63)) ) \
4863 { \
4864 *piDst = a_iTypeMin; \
4865 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4866 { \
4867 fFsw |= X86_FSW_PE; \
4868 if (!(fFcw & X86_FCW_PM)) \
4869 fFsw |= X86_FSW_ES | X86_FSW_B; \
4870 } \
4871 } \
4872 /* \
4873 * Figure this weirdness. \
4874 */ \
4875 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4876 { \
4877 *piDst = 0; \
4878 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4879 { \
4880 fFsw |= X86_FSW_PE; \
4881 if (!(fFcw & X86_FCW_PM)) \
4882 fFsw |= X86_FSW_ES | X86_FSW_B; \
4883 } \
4884 } \
4885 /* \
4886 * Too large/small number outside the target integer range. \
4887 */ \
4888 else \
4889 { \
4890 fFsw |= X86_FSW_IE; \
4891 if (fFcw & X86_FCW_IM) \
4892 *piDst = a_iTypeIndefinite; \
4893 else \
4894 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4895 } \
4896 } \
4897 /* \
4898 * Map both +0 and -0 to integer zero (signless/+). \
4899 */ \
4900 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4901 *piDst = 0; \
4902 /* \
4903 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4904 */ \
4905 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4906 { \
4907 *piDst = 0; \
4908 fFsw |= X86_FSW_PE; \
4909 if (!(fFcw & X86_FCW_PM)) \
4910 fFsw |= X86_FSW_ES | X86_FSW_B; \
4911 } \
4912 /* \
4913 * All other special values are considered invalid arguments and result \
4914 * in an IE exception and indefinite value if masked. \
4915 */ \
4916 else \
4917 { \
4918 fFsw |= X86_FSW_IE; \
4919 if (fFcw & X86_FCW_IM) \
4920 *piDst = a_iTypeIndefinite; \
4921 else \
4922 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4923 } \
4924 *pu16FSW = fFsw; \
4925}
4926#if defined(IEM_WITHOUT_ASSEMBLY)
4927EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4928EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4929EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4930#endif
4931EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4932EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4933
4934
4935#if defined(IEM_WITHOUT_ASSEMBLY)
4936
4937IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4938 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4939{
4940 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4941 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4942 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4943 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4944 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4945
4946 uint16_t const fFcw = pFpuState->FCW;
4947 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4948 bool const fSignIn = pr80Src->s.fSign;
4949
4950 /*
4951 * Deal with normal numbers first.
4952 */
4953 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4954 {
4955 uint64_t uMantissa = pr80Src->s.uMantissa;
4956 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4957 if ( (uint32_t)iExponent <= 58
4958 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4959 {
4960 unsigned const cShiftOff = 63 - iExponent;
4961 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4962 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4963 ? RT_BIT_64(cShiftOff - 1)
4964 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4965 ? fRoundingOffMask
4966 : 0;
4967 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4968
4969 uMantissa >>= cShiftOff;
4970 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4971 uMantissa += uRounding;
4972 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4973 {
4974 if (fRoundedOff)
4975 {
4976 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4977 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4978 else if (uRounding)
4979 fFsw |= X86_FSW_C1;
4980 fFsw |= X86_FSW_PE;
4981 if (!(fFcw & X86_FCW_PM))
4982 fFsw |= X86_FSW_ES | X86_FSW_B;
4983 }
4984
4985 pd80Dst->s.fSign = fSignIn;
4986 pd80Dst->s.uPad = 0;
4987 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4988 {
4989 unsigned const uDigits = uMantissa % 100;
4990 uMantissa /= 100;
4991 uint8_t const bLo = uDigits % 10;
4992 uint8_t const bHi = uDigits / 10;
4993 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4994 }
4995 }
4996 else
4997 {
4998 /* overflowed after rounding. */
4999 fFsw |= X86_FSW_IE;
5000 if (fFcw & X86_FCW_IM)
5001 *pd80Dst = s_d80Indefinite;
5002 else
5003 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
5004 }
5005 }
5006 /*
5007 * Tiny sub-zero numbers.
5008 */
5009 else if (iExponent < 0)
5010 {
5011 if (!fSignIn)
5012 {
5013 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
5014 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
5015 {
5016 *pd80Dst = s_ad80One[fSignIn];
5017 fFsw |= X86_FSW_C1;
5018 }
5019 else
5020 *pd80Dst = s_ad80Zeros[fSignIn];
5021 }
5022 else
5023 {
5024 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
5025 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
5026 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
5027 *pd80Dst = s_ad80Zeros[fSignIn];
5028 else
5029 {
5030 *pd80Dst = s_ad80One[fSignIn];
5031 fFsw |= X86_FSW_C1;
5032 }
5033 }
5034 fFsw |= X86_FSW_PE;
5035 if (!(fFcw & X86_FCW_PM))
5036 fFsw |= X86_FSW_ES | X86_FSW_B;
5037 }
5038 /*
5039 * Too large/small number outside the target integer range.
5040 */
5041 else
5042 {
5043 fFsw |= X86_FSW_IE;
5044 if (fFcw & X86_FCW_IM)
5045 *pd80Dst = s_d80Indefinite;
5046 else
5047 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
5048 }
5049 }
5050 /*
5051 * Map both +0 and -0 to integer zero (signless/+).
5052 */
5053 else if (RTFLOAT80U_IS_ZERO(pr80Src))
5054 *pd80Dst = s_ad80Zeros[fSignIn];
5055 /*
5056 * Denormals are just really tiny sub-zero numbers that are either rounded
5057 * to zero, 1 or -1 depending on sign and rounding control.
5058 */
5059 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
5060 {
5061 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
5062 *pd80Dst = s_ad80Zeros[fSignIn];
5063 else
5064 {
5065 *pd80Dst = s_ad80One[fSignIn];
5066 fFsw |= X86_FSW_C1;
5067 }
5068 fFsw |= X86_FSW_PE;
5069 if (!(fFcw & X86_FCW_PM))
5070 fFsw |= X86_FSW_ES | X86_FSW_B;
5071 }
5072 /*
5073 * All other special values are considered invalid arguments and result
5074 * in an IE exception and indefinite value if masked.
5075 */
5076 else
5077 {
5078 fFsw |= X86_FSW_IE;
5079 if (fFcw & X86_FCW_IM)
5080 *pd80Dst = s_d80Indefinite;
5081 else
5082 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
5083 }
5084 *pu16FSW = fFsw;
5085}
5086
5087
5088/*********************************************************************************************************************************
5089* FPU Helpers *
5090*********************************************************************************************************************************/
5091AssertCompileSize(RTFLOAT128U, 16);
5092AssertCompileSize(RTFLOAT80U, 10);
5093AssertCompileSize(RTFLOAT64U, 8);
5094AssertCompileSize(RTFLOAT32U, 4);
5095
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro will declare a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
 * a normalization was performed.  When no normalization is needed, the
 * declared variable is left uninitialized and @a a_pr80Val is unchanged.
 *
 * The expansion ends without a semicolon, so the invocation must supply one.
 *
 * @note This must be applied before calling SoftFloat with a value that could be
 *       a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *       correctly.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (!(RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val))) \
    { /* nothing to normalize */ } \
    else do \
    { \
        a_r80ValNormalized             = *(a_pr80Val); \
        a_r80ValNormalized.s.uExponent = 1; \
        (a_pr80Val)                    = &a_r80ValNormalized; \
    } while (0)
5119
5120#ifdef IEM_WITH_FLOAT128_FOR_FPU
5121
5122DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
5123{
5124 int fNew;
5125 switch (fFcw & X86_FCW_RC_MASK)
5126 {
5127 default:
5128 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
5129 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
5130 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
5131 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
5132 }
5133 int fOld = fegetround();
5134 fesetround(fNew);
5135 return fOld;
5136}
5137
5138
5139DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
5140{
5141 fesetround(fOld);
5142}
5143
5144DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
5145{
5146 RT_NOREF(fFcw);
5147 RTFLOAT128U Tmp;
5148 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
5149 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
5150 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
5151 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
5152 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
5153 {
5154 Assert(Tmp.s.uExponent == 0);
5155 Tmp.s2.uSignAndExponent++;
5156 }
5157 return *(_Float128 *)&Tmp;
5158}
5159
5160
5161DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5162{
5163 RT_NOREF(fFcw);
5164 RTFLOAT128U Tmp;
5165 *(_Float128 *)&Tmp = rd128ValSrc;
5166 ASMCompilerBarrier();
5167 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5168 {
5169 pr80Dst->s.fSign = Tmp.s64.fSign;
5170 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5171 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5172 | Tmp.s64.uFractionLo >> (64 - 15);
5173
5174 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5175 unsigned const cShiftOff = 64 - 15;
5176 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5177 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5178 if (uRoundedOff)
5179 {
5180 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5181 ? RT_BIT_64(cShiftOff - 1)
5182 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5183 ? fRoundingOffMask
5184 : 0;
5185 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5186 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5187 || uRoundedOff != uRoundingAdd)
5188 {
5189 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5190 {
5191 uFraction += 1;
5192 if (!(uFraction & RT_BIT_64(63)))
5193 { /* likely */ }
5194 else
5195 {
5196 uFraction >>= 1;
5197 pr80Dst->s.uExponent++;
5198 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5199 return fFsw;
5200 }
5201 fFsw |= X86_FSW_C1;
5202 }
5203 }
5204 fFsw |= X86_FSW_PE;
5205 if (!(fFcw & X86_FCW_PM))
5206 fFsw |= X86_FSW_ES | X86_FSW_B;
5207 }
5208 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5209 }
5210 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5211 {
5212 pr80Dst->s.fSign = Tmp.s64.fSign;
5213 pr80Dst->s.uExponent = 0;
5214 pr80Dst->s.uMantissa = 0;
5215 }
5216 else if (RTFLOAT128U_IS_INF(&Tmp))
5217 {
5218 pr80Dst->s.fSign = Tmp.s64.fSign;
5219 pr80Dst->s.uExponent = 0;
5220 pr80Dst->s.uMantissa = 0;
5221 }
5222 return fFsw;
5223}
5224
5225
5226#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5227
/**
 * Initializer for the SoftFloat state structure, derived from a FPU control
 * word.
 *
 * Five values are supplied, in this order (the member names are presumably
 * those of softfloat_state_t - confirm against the SoftFloat headers):
 *      -# tininess detection: always after rounding,
 *      -# rounding mode: mapped from FCW.RC,
 *      -# zero (presumably the initial exception flags),
 *      -# the exception mask bits of the FCW,
 *      -# rounding precision: 64 for PC=53, 32 for PC=24, otherwise 80.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? (uint8_t)softfloat_round_min \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        :                                                      (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        :                                                 (uint8_t)80 \
    }
5241
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * ORs three things into @a a_fFsw: the SoftFloat C1 flag shifted left by two,
 * the raised exception flags, and ES + B when any raised exception is not
 * masked by @a a_fFcw.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (   (a_fFsw) \
      | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
      | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
      | (   ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
          ? X86_FSW_ES | X86_FSW_B \
          : 0) )
5249
5250
5251DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5252{
5253 RT_NOREF(fFcw);
5254 Assert(cBits > 64);
5255# if 0 /* rounding does not seem to help */
5256 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5257 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5258 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5259 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5260 {
5261 uint64_t uOld = r128.v[0];
5262 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5263 if (r128.v[0] < uOld)
5264 r128.v[1] += 1;
5265 }
5266# else
5267 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5268# endif
5269 return r128;
5270}
5271
5272
5273DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5274{
5275 RT_NOREF(fFcw);
5276 Assert(cBits > 64);
5277# if 0 /* rounding does not seem to help, not even on constants */
5278 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5279 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5280 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5281 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5282 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5283 {
5284 uint64_t uOld = r128.v[0];
5285 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5286 if (r128.v[0] < uOld)
5287 r128.v[1] += 1;
5288 }
5289 return r128;
5290# else
5291 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5292 return r128;
5293# endif
5294}
5295
5296
# if 0 /* unused */
/** Copies the two quadwords of an IPRT 128-bit float into a SoftFloat float128_t. */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128Ret;
    r128Ret.v[0] = pr128->au64[0];
    r128Ret.v[1] = pr128->au64[1];
    return r128Ret;
}
# endif
5304
5305
5306/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5307DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5308{
5309 extFloat80_t Tmp;
5310 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5311 Tmp.signif = pr80Val->s2.uMantissa;
5312 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5313 return extF80_to_f128(Tmp, &Ignored);
5314}
5315
5316
5317/**
5318 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5319 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5320 *
5321 * This is only a structure format conversion, nothing else.
5322 */
5323DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5324{
5325 extFloat80_t Tmp;
5326 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5327 Tmp.signif = pr80Val->s2.uMantissa;
5328 return Tmp;
5329}
5330
5331
5332/**
5333 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5334 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5335 *
5336 * This is only a structure format conversion, nothing else.
5337 */
5338DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5339{
5340 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5341 pr80Dst->s2.uMantissa = r80XSrc.signif;
5342 return pr80Dst;
5343}
5344
5345
5346DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5347{
5348 RT_NOREF(fFcw);
5349 RTFLOAT128U Tmp;
5350 *(float128_t *)&Tmp = r128Src;
5351 ASMCompilerBarrier();
5352
5353 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5354 {
5355 pr80Dst->s.fSign = Tmp.s64.fSign;
5356 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5357 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5358 | Tmp.s64.uFractionLo >> (64 - 15);
5359
5360 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5361 unsigned const cShiftOff = 64 - 15;
5362 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5363 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5364 if (uRoundedOff)
5365 {
5366 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5367 ? RT_BIT_64(cShiftOff - 1)
5368 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5369 ? fRoundingOffMask
5370 : 0;
5371 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5372 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5373 || uRoundedOff != uRoundingAdd)
5374 {
5375 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5376 {
5377 uFraction += 1;
5378 if (!(uFraction & RT_BIT_64(63)))
5379 { /* likely */ }
5380 else
5381 {
5382 uFraction >>= 1;
5383 pr80Dst->s.uExponent++;
5384 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5385 return fFsw;
5386 }
5387 fFsw |= X86_FSW_C1;
5388 }
5389 }
5390 fFsw |= X86_FSW_PE;
5391 if (!(fFcw & X86_FCW_PM))
5392 fFsw |= X86_FSW_ES | X86_FSW_B;
5393 }
5394
5395 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5396 }
5397 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5398 {
5399 pr80Dst->s.fSign = Tmp.s64.fSign;
5400 pr80Dst->s.uExponent = 0;
5401 pr80Dst->s.uMantissa = 0;
5402 }
5403 else if (RTFLOAT128U_IS_INF(&Tmp))
5404 {
5405 pr80Dst->s.fSign = Tmp.s64.fSign;
5406 pr80Dst->s.uExponent = 0x7fff;
5407 pr80Dst->s.uMantissa = 0;
5408 }
5409 return fFsw;
5410}
5411
5412
5413/**
5414 * Helper for transfering exception and C1 to FSW and setting the result value
5415 * accordingly.
5416 *
5417 * @returns Updated FSW.
5418 * @param pSoftState The SoftFloat state following the operation.
5419 * @param r80XResult The result of the SoftFloat operation.
5420 * @param pr80Result Where to store the result for IEM.
5421 * @param fFcw The FPU control word.
5422 * @param fFsw The FSW before the operation, with necessary bits
5423 * cleared and such.
5424 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5425 * raised.
5426 */
5427DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5428 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5429 PCRTFLOAT80U pr80XcptResult)
5430{
5431 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5432 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5433 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5434 fFsw |= X86_FSW_ES | X86_FSW_B;
5435
5436 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5437 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5438 else
5439 {
5440 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5441 *pr80Result = *pr80XcptResult;
5442 }
5443 return fFsw;
5444}
5445
5446
5447/**
5448 * Helper doing polynomial evaluation using Horner's method.
5449 *
5450 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5451 */
5452float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5453 unsigned cPrecision, softfloat_state_t *pSoftState)
5454{
5455 Assert(cHornerConsts > 1);
5456 size_t i = cHornerConsts - 1;
5457 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5458 while (i-- > 0)
5459 {
5460 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5461 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5462 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5463 }
5464 return r128Result;
5465}
5466
5467#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5468
5469
5470/**
5471 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5472 * mantissa, exponent and sign.
5473 *
5474 * @returns Updated FSW.
5475 * @param pr80Dst Where to return the composed value.
5476 * @param fSign The sign.
5477 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5478 * ignored and should be zero. This will probably be
5479 * modified during normalization and rounding.
5480 * @param iExponent Unbiased exponent.
5481 * @param fFcw The FPU control word.
5482 * @param fFsw The FPU status word.
5483 */
5484static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5485 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5486{
5487 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5488
5489 iExponent += RTFLOAT80U_EXP_BIAS;
5490
5491 /* Do normalization if necessary and possible. */
5492 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5493 {
5494 int cShift = 192 - RTUInt256BitCount(puMantissa);
5495 if (iExponent > cShift)
5496 iExponent -= cShift;
5497 else
5498 {
5499 if (fFcw & X86_FCW_UM)
5500 {
5501 if (iExponent > 0)
5502 cShift = --iExponent;
5503 else
5504 cShift = 0;
5505 }
5506 iExponent -= cShift;
5507 }
5508 RTUInt256AssignShiftLeft(puMantissa, cShift);
5509 }
5510
5511 /* Do rounding. */
5512 uint64_t uMantissa = puMantissa->QWords.qw2;
5513 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5514 {
5515 bool fAdd;
5516 switch (fFcw & X86_FCW_RC_MASK)
5517 {
5518 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5519 case X86_FCW_RC_NEAREST:
5520 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5521 {
5522 if ( (uMantissa & 1)
5523 || puMantissa->QWords.qw0 != 0
5524 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5525 {
5526 fAdd = true;
5527 break;
5528 }
5529 uMantissa &= ~(uint64_t)1;
5530 }
5531 fAdd = false;
5532 break;
5533 case X86_FCW_RC_ZERO:
5534 fAdd = false;
5535 break;
5536 case X86_FCW_RC_UP:
5537 fAdd = !fSign;
5538 break;
5539 case X86_FCW_RC_DOWN:
5540 fAdd = fSign;
5541 break;
5542 }
5543 if (fAdd)
5544 {
5545 uint64_t const uTmp = uMantissa;
5546 uMantissa = uTmp + 1;
5547 if (uMantissa < uTmp)
5548 {
5549 uMantissa >>= 1;
5550 uMantissa |= RT_BIT_64(63);
5551 iExponent++;
5552 }
5553 fFsw |= X86_FSW_C1;
5554 }
5555 fFsw |= X86_FSW_PE;
5556 if (!(fFcw & X86_FCW_PM))
5557 fFsw |= X86_FSW_ES | X86_FSW_B;
5558 }
5559
5560 /* Check for underflow (denormals). */
5561 if (iExponent <= 0)
5562 {
5563 if (fFcw & X86_FCW_UM)
5564 {
5565 if (uMantissa & RT_BIT_64(63))
5566 uMantissa >>= 1;
5567 iExponent = 0;
5568 }
5569 else
5570 {
5571 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5572 fFsw |= X86_FSW_ES | X86_FSW_B;
5573 }
5574 fFsw |= X86_FSW_UE;
5575 }
5576 /* Check for overflow */
5577 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5578 {
5579 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5580 }
5581
5582 /* Compose the result. */
5583 pr80Dst->s.uMantissa = uMantissa;
5584 pr80Dst->s.uExponent = iExponent;
5585 pr80Dst->s.fSign = fSign;
5586 return fFsw;
5587}
5588
5589
5590/**
5591 * See also iemAImpl_fld_r80_from_r32
5592 */
5593static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5594{
5595 uint16_t fFsw = 0;
5596 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5597 {
5598 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5599 pr80Dst->sj64.fInteger = 1;
5600 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5601 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5602 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5603 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5604 }
5605 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5606 {
5607 pr80Dst->s.fSign = pr32Val->s.fSign;
5608 pr80Dst->s.uExponent = 0;
5609 pr80Dst->s.uMantissa = 0;
5610 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5611 }
5612 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5613 {
5614 /* Subnormal -> normalized + X86_FSW_DE return. */
5615 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5616 pr80Dst->sj64.fInteger = 1;
5617 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5618 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5619 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5620 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5621 fFsw = X86_FSW_DE;
5622 }
5623 else if (RTFLOAT32U_IS_INF(pr32Val))
5624 {
5625 pr80Dst->s.fSign = pr32Val->s.fSign;
5626 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5627 pr80Dst->s.uMantissa = RT_BIT_64(63);
5628 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5629 }
5630 else
5631 {
5632 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5633 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5634 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5635 pr80Dst->sj64.fInteger = 1;
5636 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5637 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5638 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5639 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5640 }
5641 return fFsw;
5642}
5643
5644
5645/**
5646 * See also iemAImpl_fld_r80_from_r64
5647 */
5648static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5649{
5650 uint16_t fFsw = 0;
5651 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5652 {
5653 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5654 pr80Dst->sj64.fInteger = 1;
5655 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5656 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5657 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5658 }
5659 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5660 {
5661 pr80Dst->s.fSign = pr64Val->s.fSign;
5662 pr80Dst->s.uExponent = 0;
5663 pr80Dst->s.uMantissa = 0;
5664 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5665 }
5666 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5667 {
5668 /* Subnormal values gets normalized. */
5669 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5670 pr80Dst->sj64.fInteger = 1;
5671 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5672 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5673 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5674 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5675 fFsw = X86_FSW_DE;
5676 }
5677 else if (RTFLOAT64U_IS_INF(pr64Val))
5678 {
5679 pr80Dst->s.fSign = pr64Val->s.fSign;
5680 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5681 pr80Dst->s.uMantissa = RT_BIT_64(63);
5682 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5683 }
5684 else
5685 {
5686 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5687 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5688 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5689 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5690 pr80Dst->sj64.fInteger = 1;
5691 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5692 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5693 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5694 }
5695 return fFsw;
5696}
5697
5698
5699/**
5700 * See also EMIT_FILD.
5701 */
5702#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5703static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5704{ \
5705 if (iVal == 0) \
5706 { \
5707 pr80Dst->s.fSign = 0; \
5708 pr80Dst->s.uExponent = 0; \
5709 pr80Dst->s.uMantissa = 0; \
5710 } \
5711 else \
5712 { \
5713 if (iVal > 0) \
5714 pr80Dst->s.fSign = 0; \
5715 else \
5716 { \
5717 pr80Dst->s.fSign = 1; \
5718 iVal = -iVal; \
5719 } \
5720 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5721 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5722 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5723 } \
5724 return pr80Dst; \
5725}
5726EMIT_CONVERT_IXX_TO_R80(16)
5727EMIT_CONVERT_IXX_TO_R80(32)
5728//EMIT_CONVERT_IXX_TO_R80(64)
5729
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * Converts the 64-bit operand to 80-bit and hands both to @a a_fnR80ByR80.
 * If the conversion reported a denormal operand:
 *   - the \#DE is dropped when the first operand is 387-invalid or a NaN, or
 *     when @a a_DenormalException evaluates to true;
 *   - otherwise, with DM clear in the FCW, the first operand is returned
 *     together with DE + ES + B and the operation is not performed;
 *   - otherwise DE is OR'ed into the final FSW.
 * The TOP field of the returned FSW is always set to 7.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        bool const fDropDe = RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException); \
        if (fDropDe) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked #DE: return the first operand and flag the exception. */ \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                         | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5752
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * Converts the 32-bit operand to 80-bit and hands both to @a a_fnR80ByR80.
 * If the conversion reported a denormal operand:
 *   - the \#DE is dropped when the first operand is 387-invalid or a NaN, or
 *     when @a a_DenormalException evaluates to true;
 *   - otherwise, with DM clear in the FCW, the first operand is returned
 *     together with DE + ES + B and the operation is not performed;
 *   - otherwise DE is OR'ed into the final FSW.
 * The TOP field of the returned FSW is always set to 7.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        bool const fDropDe = RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException); \
        if (fDropDe) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked #DE: return the first operand and flag the exception. */ \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                         | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5775
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * Converts the 32-bit integer operand to 80-bit floating point, invokes
 * @a a_fnR80ByR80 and sets the TOP field of the returned FSW to 7.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5784
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * Converts the 16-bit integer operand to 80-bit floating point, invokes
 * @a a_fnR80ByR80 and sets the TOP field of the returned FSW to 7.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5793
5794
5795
5796/*********************************************************************************************************************************
5797* x86 FPU Division Operations *
5798*********************************************************************************************************************************/
5799
5800/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5801static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5802 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5803{
5804 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5805 {
5806 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5807 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5808 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5809 }
5810 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5811 { /* Div by zero. */
5812 if (fFcw & X86_FCW_ZM)
5813 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5814 else
5815 {
5816 *pr80Result = *pr80Val1Org;
5817 fFsw |= X86_FSW_ES | X86_FSW_B;
5818 }
5819 fFsw |= X86_FSW_ZE;
5820 }
5821 else
5822 { /* Invalid operand */
5823 if (fFcw & X86_FCW_IM)
5824 *pr80Result = g_r80Indefinite;
5825 else
5826 {
5827 *pr80Result = *pr80Val1Org;
5828 fFsw |= X86_FSW_ES | X86_FSW_B;
5829 }
5830 fFsw |= X86_FSW_IE;
5831 }
5832 return fFsw;
5833}
5834
5835
5836IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5837 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5838{
5839 uint16_t const fFcw = pFpuState->FCW;
5840 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5841
5842 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5843 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5844 {
5845 if (fFcw & X86_FCW_IM)
5846 pFpuRes->r80Result = g_r80Indefinite;
5847 else
5848 {
5849 pFpuRes->r80Result = *pr80Val1;
5850 fFsw |= X86_FSW_ES | X86_FSW_B;
5851 }
5852 fFsw |= X86_FSW_IE;
5853 }
5854 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5855 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5856 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5857 {
5858 if (fFcw & X86_FCW_DM)
5859 {
5860 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5861 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5862 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5863 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5864 }
5865 else
5866 {
5867 pFpuRes->r80Result = *pr80Val1;
5868 fFsw |= X86_FSW_ES | X86_FSW_B;
5869 }
5870 fFsw |= X86_FSW_DE;
5871 }
5872 /* SoftFloat can handle the rest: */
5873 else
5874 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5875
5876 pFpuRes->FSW = fFsw;
5877}
5878
5879
5880EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5881EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5882EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5883EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5884
5885
5886IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5887 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5888{
5889 uint16_t const fFcw = pFpuState->FCW;
5890 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5891
5892 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5893 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5894 {
5895 if (fFcw & X86_FCW_IM)
5896 pFpuRes->r80Result = g_r80Indefinite;
5897 else
5898 {
5899 pFpuRes->r80Result = *pr80Val1;
5900 fFsw |= X86_FSW_ES | X86_FSW_B;
5901 }
5902 fFsw |= X86_FSW_IE;
5903 }
5904 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5905 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5906 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5907 {
5908 if (fFcw & X86_FCW_DM)
5909 {
5910 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5911 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5912 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5913 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5914 }
5915 else
5916 {
5917 pFpuRes->r80Result = *pr80Val1;
5918 fFsw |= X86_FSW_ES | X86_FSW_B;
5919 }
5920 fFsw |= X86_FSW_DE;
5921 }
5922 /* SoftFloat can handle the rest: */
5923 else
5924 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5925
5926 pFpuRes->FSW = fFsw;
5927}
5928
5929
5930EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5931EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5932EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5933EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5934
5935
5936/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5937static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5938 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5939{
5940 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5941 {
5942 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5943 uint16_t fCxFlags = 0;
5944 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5945 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5946 &fCxFlags, &SoftState);
5947 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5948 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5949 if ( !(fFsw & X86_FSW_IE)
5950 && !RTFLOAT80U_IS_NAN(pr80Result)
5951 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5952 {
5953 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5954 fFsw |= fCxFlags & X86_FSW_C_MASK;
5955 }
5956 return fFsw;
5957 }
5958
5959 /* Invalid operand */
5960 if (fFcw & X86_FCW_IM)
5961 *pr80Result = g_r80Indefinite;
5962 else
5963 {
5964 *pr80Result = *pr80Val1Org;
5965 fFsw |= X86_FSW_ES | X86_FSW_B;
5966 }
5967 return fFsw | X86_FSW_IE;
5968}
5969
5970
5971static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5972 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5973{
5974 uint16_t const fFcw = pFpuState->FCW;
5975 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5976
5977 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5978 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5979 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5980 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5981 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5982 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5983 {
5984 if (fFcw & X86_FCW_IM)
5985 pFpuRes->r80Result = g_r80Indefinite;
5986 else
5987 {
5988 pFpuRes->r80Result = *pr80Val1;
5989 fFsw |= X86_FSW_ES | X86_FSW_B;
5990 }
5991 fFsw |= X86_FSW_IE;
5992 }
5993 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5994 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5995 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5996 {
5997 if (fFcw & X86_FCW_DM)
5998 {
5999 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6000 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6001 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6002 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6003 pr80Val1Org, fLegacyInstr);
6004 }
6005 else
6006 {
6007 pFpuRes->r80Result = *pr80Val1;
6008 fFsw |= X86_FSW_ES | X86_FSW_B;
6009 }
6010 fFsw |= X86_FSW_DE;
6011 }
6012 /* SoftFloat can handle the rest: */
6013 else
6014 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6015 pr80Val1, fLegacyInstr);
6016
6017 pFpuRes->FSW = fFsw;
6018}
6019
6020
6021IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6022 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6023{
6024 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
6025}
6026
6027
6028IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6029 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6030{
6031 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
6032}
6033
6034
6035/*********************************************************************************************************************************
6036* x87 FPU Multiplication Operations *
6037*********************************************************************************************************************************/
6038
6039/** Worker for iemAImpl_fmul_r80_by_r80. */
6040static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6041 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6042{
6043 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6044 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6045 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6046}
6047
6048
6049IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6050 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6051{
6052 uint16_t const fFcw = pFpuState->FCW;
6053 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6054
6055 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6056 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6057 {
6058 if (fFcw & X86_FCW_IM)
6059 pFpuRes->r80Result = g_r80Indefinite;
6060 else
6061 {
6062 pFpuRes->r80Result = *pr80Val1;
6063 fFsw |= X86_FSW_ES | X86_FSW_B;
6064 }
6065 fFsw |= X86_FSW_IE;
6066 }
6067 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6068 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6069 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6070 {
6071 if (fFcw & X86_FCW_DM)
6072 {
6073 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6074 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6075 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6076 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6077 }
6078 else
6079 {
6080 pFpuRes->r80Result = *pr80Val1;
6081 fFsw |= X86_FSW_ES | X86_FSW_B;
6082 }
6083 fFsw |= X86_FSW_DE;
6084 }
6085 /* SoftFloat can handle the rest: */
6086 else
6087 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6088
6089 pFpuRes->FSW = fFsw;
6090}
6091
6092
6093EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
6094EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
6095EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
6096EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
6097
6098
6099/*********************************************************************************************************************************
6100* x87 FPU Addition *
6101*********************************************************************************************************************************/
6102
6103/** Worker for iemAImpl_fadd_r80_by_r80. */
6104static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6105 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6106{
6107 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6108 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6109 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6110}
6111
6112
6113IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6114 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6115{
6116 uint16_t const fFcw = pFpuState->FCW;
6117 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6118
6119 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6120 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6121 {
6122 if (fFcw & X86_FCW_IM)
6123 pFpuRes->r80Result = g_r80Indefinite;
6124 else
6125 {
6126 pFpuRes->r80Result = *pr80Val1;
6127 fFsw |= X86_FSW_ES | X86_FSW_B;
6128 }
6129 fFsw |= X86_FSW_IE;
6130 }
6131 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6132 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6133 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6134 {
6135 if (fFcw & X86_FCW_DM)
6136 {
6137 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6138 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6139 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6140 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6141 }
6142 else
6143 {
6144 pFpuRes->r80Result = *pr80Val1;
6145 fFsw |= X86_FSW_ES | X86_FSW_B;
6146 }
6147 fFsw |= X86_FSW_DE;
6148 }
6149 /* SoftFloat can handle the rest: */
6150 else
6151 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6152
6153 pFpuRes->FSW = fFsw;
6154}
6155
6156
/* The r64, r32, i32 and i16 second-operand variants of FADD/FIADD.  These are
   generated by the EMIT_R80_BY_* macros (defined outside this section), each
   of which is given iemAImpl_fadd_r80_by_r80 as the 80-bit implementation. */
EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6161
6162
6163/*********************************************************************************************************************************
6164* x87 FPU Subtraction *
6165*********************************************************************************************************************************/
6166
6167/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6168static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6169 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6170{
6171 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6172 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6173 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6174}
6175
6176
6177IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6178 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6179{
6180 uint16_t const fFcw = pFpuState->FCW;
6181 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6182
6183 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6184 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6185 {
6186 if (fFcw & X86_FCW_IM)
6187 pFpuRes->r80Result = g_r80Indefinite;
6188 else
6189 {
6190 pFpuRes->r80Result = *pr80Val1;
6191 fFsw |= X86_FSW_ES | X86_FSW_B;
6192 }
6193 fFsw |= X86_FSW_IE;
6194 }
6195 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6196 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6197 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6198 {
6199 if (fFcw & X86_FCW_DM)
6200 {
6201 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6202 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6203 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6204 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6205 }
6206 else
6207 {
6208 pFpuRes->r80Result = *pr80Val1;
6209 fFsw |= X86_FSW_ES | X86_FSW_B;
6210 }
6211 fFsw |= X86_FSW_DE;
6212 }
6213 /* SoftFloat can handle the rest: */
6214 else
6215 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6216
6217 pFpuRes->FSW = fFsw;
6218}
6219
6220
/* The r64, r32, i32 and i16 second-operand variants of FSUB/FISUB.  These are
   generated by the EMIT_R80_BY_* macros (defined outside this section), each
   of which is given iemAImpl_fsub_r80_by_r80 as the 80-bit implementation. */
EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6225
6226
6227/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6228IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6229 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6230{
6231 uint16_t const fFcw = pFpuState->FCW;
6232 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6233
6234 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6235 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6236 {
6237 if (fFcw & X86_FCW_IM)
6238 pFpuRes->r80Result = g_r80Indefinite;
6239 else
6240 {
6241 pFpuRes->r80Result = *pr80Val1;
6242 fFsw |= X86_FSW_ES | X86_FSW_B;
6243 }
6244 fFsw |= X86_FSW_IE;
6245 }
6246 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6247 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6248 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6249 {
6250 if (fFcw & X86_FCW_DM)
6251 {
6252 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6253 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6254 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6255 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6256 }
6257 else
6258 {
6259 pFpuRes->r80Result = *pr80Val1;
6260 fFsw |= X86_FSW_ES | X86_FSW_B;
6261 }
6262 fFsw |= X86_FSW_DE;
6263 }
6264 /* SoftFloat can handle the rest: */
6265 else
6266 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6267
6268 pFpuRes->FSW = fFsw;
6269}
6270
6271
/* The r64, r32, i32 and i16 second-operand variants of FSUBR/FISUBR.  These
   are generated by the EMIT_R80_BY_* macros (defined outside this section),
   each of which is given iemAImpl_fsubr_r80_by_r80 as the 80-bit
   implementation. */
EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6276
6277
6278/*********************************************************************************************************************************
6279* x87 FPU Trigonometric Operations *
6280*********************************************************************************************************************************/
6281static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6282{
6283 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6284 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6285 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6286 extFloat80_t v;
6287 (void)fFcw;
6288
6289 v = extF80_atan2(y, x, &SoftState);
6290
6291 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6292 return fFsw;
6293}
6294
6295IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6296 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6297{
6298 uint16_t const fFcw = pFpuState->FCW;
6299 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6300
6301 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6302 {
6303 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6304
6305 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6306 if (!(fFcw & X86_FCW_PM))
6307 fFsw |= X86_FSW_ES | X86_FSW_B;
6308 }
6309 else
6310 {
6311 fFsw |= X86_FSW_IE;
6312 if (!(fFcw & X86_FCW_IM))
6313 {
6314 pFpuRes->r80Result = *pr80Val2;
6315 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6316 }
6317 else
6318 {
6319 pFpuRes->r80Result = g_r80Indefinite;
6320 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6321 }
6322 }
6323
6324 pFpuRes->FSW = fFsw;
6325}
6326#endif /* IEM_WITHOUT_ASSEMBLY */
6327
/**
 * Intel flavoured FPATAN entry point.
 *
 * Simply forwards all arguments to iemAImpl_fpatan_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                          PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6333
/**
 * AMD flavoured FPATAN entry point.
 *
 * Simply forwards all arguments to iemAImpl_fpatan_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6339
6340
6341#if defined(IEM_WITHOUT_ASSEMBLY)
6342static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6343{
6344 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6345 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6346 extFloat80_t v;
6347 (void)fFcw;
6348
6349 v = extF80_tan(x, &SoftState);
6350
6351 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6352 return fFsw;
6353}
6354
6355IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6356{
6357 uint16_t const fFcw = pFpuState->FCW;
6358 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6359
6360 if (RTFLOAT80U_IS_ZERO(pr80Val))
6361 {
6362 pFpuResTwo->r80Result1 = *pr80Val;
6363 pFpuResTwo->r80Result2 = g_ar80One[0];
6364 }
6365 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6366 {
6367 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6368 {
6369 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6370 pFpuResTwo->r80Result1 = *pr80Val;
6371 }
6372 else
6373 {
6374 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6375 {
6376 pFpuResTwo->r80Result1 = *pr80Val;
6377 }
6378 else
6379 {
6380 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6381 }
6382
6383 pFpuResTwo->r80Result2 = g_ar80One[0];
6384
6385 fFsw |= X86_FSW_PE;
6386 if (!(fFcw & X86_FCW_PM))
6387 fFsw |= X86_FSW_ES | X86_FSW_B;
6388 }
6389 }
6390 else
6391 {
6392 fFsw |= X86_FSW_IE;
6393 if (!(fFcw & X86_FCW_IM))
6394 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6395 }
6396
6397 pFpuResTwo->FSW = fFsw;
6398}
6399#endif /* IEM_WITHOUT_ASSEMBLY */
6400
/**
 * AMD flavoured FPTAN entry point.
 *
 * Simply forwards all arguments to iemAImpl_fptan_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6405
/**
 * Intel flavoured FPTAN entry point.
 *
 * Simply forwards all arguments to iemAImpl_fptan_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6410
6411#ifdef IEM_WITHOUT_ASSEMBLY
6412
6413static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6414{
6415 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6416 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6417 extFloat80_t v;
6418 (void)fFcw;
6419
6420 v = extF80_sin(x, &SoftState);
6421
6422 iemFpuSoftF80ToIprt(pr80Result, v);
6423
6424 return fFsw;
6425}
6426
/**
 * FSIN - sine of an 80-bit value.
 *
 * The input is classified (zero, normal, infinity, denormal, pseudo-denormal,
 * quiet NaN / indefinite, everything else) and each class gets its own result
 * value and status flags.  Only normal inputs whose exponent is strictly
 * between RTFLOAT80U_EXP_BIAS - 63 and RTFLOAT80U_EXP_BIAS + 63 are actually
 * run through the SoftFloat sine.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read from it.
 * @param   pFpuRes     Where the result value and the new FSW are stored.
 * @param   pr80Val     The input value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    /* Keep C0 and C3 from the incoming FSW (C2 is deliberately dropped), TOP = 7. */
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);

    if (RTFLOAT80U_IS_ZERO(pr80Val))
    {
        /* sin(+/-0) = +/-0, no exceptions. */
        pFpuRes->r80Result = *pr80Val;
    }
    else if (RTFLOAT80U_IS_NORMAL(pr80Val))
    {
        if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
        {
            /* Exponent too large: set C2 and return the input unchanged. */
            fFsw |= X86_FSW_C2;
            pFpuRes->r80Result = *pr80Val;
        }
        else
        {
            if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
            {
                /* Tiny input: the input itself is used as the result. */
                pFpuRes->r80Result = *pr80Val;
            }
            else
            {
                fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
            }
            /* The result is always flagged as inexact. */
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    else if (RTFLOAT80U_IS_INF(pr80Val))
    {
        /* Infinity: invalid operation; indefinite if masked, else the input. */
        fFsw |= X86_FSW_IE;
        if (!(fFcw & X86_FCW_IM))
        {
            fFsw |= X86_FSW_ES | X86_FSW_B;
            pFpuRes->r80Result = *pr80Val;
        }
        else
        {
            pFpuRes->r80Result = g_r80Indefinite;
        }
    }
    else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
    {
        fFsw |= X86_FSW_DE;

        if (fFcw & X86_FCW_DM)
        {
            if (fFcw & X86_FCW_UM)
            {
                /* Underflow masked: the denormal input is the result. */
                pFpuRes->r80Result = *pr80Val;
            }
            else
            {
                /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
                uint64_t uMantissa = pr80Val->s.uMantissa;
                uint32_t uExponent = ASMBitLastSetU64(uMantissa);

                /* Shift the mantissa up until its highest set bit is bit 63
                   and adjust the exponent by the bias-adjust constant.
                   NOTE(review): RTFLOAT128U_EXP_BIAS_ADJUST is used on an
                   80-bit value here; presumably it equals
                   RTFLOAT80U_EXP_BIAS_ADJUST - confirm. */
                uExponent = 64 - uExponent;
                uMantissa <<= uExponent;
                uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;

                pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
                pFpuRes->r80Result.s.uMantissa = uMantissa;
                pFpuRes->r80Result.s.uExponent = uExponent;
            }

            fFsw |= X86_FSW_UE | X86_FSW_PE;

            if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
            {
                /* All the exceptions are masked. */
            }
            else
            {
                fFsw |= X86_FSW_ES | X86_FSW_B;
            }
        }
        else
        {
            /* Unmasked #DE: return the input and set the exception summary. */
            pFpuRes->r80Result = *pr80Val;

            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
    {
        pFpuRes->r80Result = *pr80Val;
        fFsw |= X86_FSW_DE;

        if (fFcw & X86_FCW_DM)
        {
            if (fFcw & X86_FCW_PM)
            {
                fFsw |= X86_FSW_PE;
            }
            else
            {
                fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
            }

            /* Masked #DE: the result is the input with the exponent set to 1. */
            pFpuRes->r80Result.sj64.uExponent = 1;
        }
        else
        {
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    } else if (   RTFLOAT80U_IS_QUIET_NAN(pr80Val)
               || RTFLOAT80U_IS_INDEFINITE(pr80Val))
    {
        /* Quiet NaNs and the indefinite value pass through without any exception. */
        pFpuRes->r80Result = *pr80Val;
    } else {
        /* Whatever is left is treated as an invalid operation. */
        if (   (   RTFLOAT80U_IS_UNNORMAL(pr80Val)
                || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
            && (fFcw & X86_FCW_IM))
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val;
            if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
                pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
        }

        fFsw |= X86_FSW_IE;
        if (!(fFcw & X86_FCW_IM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    pFpuRes->FSW = fFsw;
}
6559#endif /* IEM_WITHOUT_ASSEMBLY */
6560
/**
 * AMD flavoured FSIN entry point.
 *
 * Simply forwards all arguments to iemAImpl_fsin_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6565
/**
 * Intel flavoured FSIN entry point.
 *
 * Simply forwards all arguments to iemAImpl_fsin_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6570
6571#ifdef IEM_WITHOUT_ASSEMBLY
6572
6573static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6574{
6575 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6576 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6577 extFloat80_t v;
6578 (void)fFcw;
6579
6580 v = extF80_cos(x, &SoftState);
6581
6582 iemFpuSoftF80ToIprt(pr80Result, v);
6583
6584 return fFsw;
6585}
6586
/**
 * FCOS - cosine of an 80-bit value.
 *
 * The input is classified (zero, normal, infinity, denormal or
 * pseudo-denormal, quiet NaN / indefinite, everything else) and each class
 * gets its own result value and status flags.  Only normal inputs whose
 * exponent is strictly between RTFLOAT80U_EXP_BIAS - 63 and
 * RTFLOAT80U_EXP_BIAS + 63 are actually run through the SoftFloat cosine.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read from it.
 * @param   pFpuRes     Where the result value and the new FSW are stored.
 * @param   pr80Val     The input value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    /* Keep C0 and C3 from the incoming FSW (C2 is deliberately dropped), TOP = 7. */
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);

    if (RTFLOAT80U_IS_ZERO(pr80Val))
    {
        /* cos(+/-0) = 1, no exceptions. */
        pFpuRes->r80Result = g_ar80One[0];
    }
    else if (RTFLOAT80U_IS_NORMAL(pr80Val))
    {
        if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
        {
            /* Exponent too large: set C2 and return the input unchanged. */
            fFsw |= X86_FSW_C2;
            pFpuRes->r80Result = *pr80Val;
        }
        else
        {
            if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
            {
                /* Tiny input: the result is 1. */
                pFpuRes->r80Result = g_ar80One[0];

            }
            else
            {
                fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
                fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
            }
            /* The result is always flagged as inexact. */
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    else if (RTFLOAT80U_IS_INF(pr80Val))
    {
        /* Infinity: invalid operation; indefinite if masked, else the input. */
        fFsw |= X86_FSW_IE;
        if (!(fFcw & X86_FCW_IM))
        {
            fFsw |= X86_FSW_ES | X86_FSW_B;
            pFpuRes->r80Result = *pr80Val;
        }
        else
        {
            pFpuRes->r80Result = g_r80Indefinite;
        }
    }
    else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
    {
        fFsw |= X86_FSW_DE;

        if (fFcw & X86_FCW_DM)
        {
            /* Masked #DE: the result is 1 and flagged as inexact. */
            pFpuRes->r80Result = g_ar80One[0];

            if (fFcw & X86_FCW_PM)
            {
                fFsw |= X86_FSW_PE;
            }
            else
            {
                fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
            }
        }
        else
        {
            /* Unmasked #DE: return the input and set the exception summary. */
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    } else if (   RTFLOAT80U_IS_QUIET_NAN(pr80Val)
               || RTFLOAT80U_IS_INDEFINITE(pr80Val))
    {
        /* Quiet NaNs and the indefinite value pass through without any exception. */
        pFpuRes->r80Result = *pr80Val;
    } else {
        /* Whatever is left is treated as an invalid operation. */
        if (   (   RTFLOAT80U_IS_UNNORMAL(pr80Val)
                || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
            && (fFcw & X86_FCW_IM))
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val;
            if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
                pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
        }

        fFsw |= X86_FSW_IE;
        if (!(fFcw & X86_FCW_IM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    pFpuRes->FSW = fFsw;
}
6678#endif /* IEM_WITHOUT_ASSEMBLY */
6679
/**
 * AMD flavoured FCOS entry point.
 *
 * Simply forwards all arguments to iemAImpl_fcos_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6684
/**
 * Intel flavoured FCOS entry point.
 *
 * Simply forwards all arguments to iemAImpl_fcos_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6689
6690#ifdef IEM_WITHOUT_ASSEMBLY
6691
6692static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6693{
6694 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6695 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6696 extFloat80_t r80Sin, r80Cos;
6697 (void)fFcw;
6698
6699 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6700
6701 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6702 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6703
6704 return fFsw;
6705}
6706
6707IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6708{
6709 uint16_t const fFcw = pFpuState->FCW;
6710 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6711
6712 if (RTFLOAT80U_IS_ZERO(pr80Val))
6713 {
6714 pFpuResTwo->r80Result1 = *pr80Val;
6715 pFpuResTwo->r80Result2 = g_ar80One[0];
6716 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6717 }
6718 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6719 {
6720 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6721 {
6722 fFsw |= X86_FSW_C2;
6723
6724 if (fFcw & X86_FCW_IM)
6725 {
6726 pFpuResTwo->r80Result1 = g_r80Indefinite;
6727 }
6728 else
6729 {
6730 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6731 }
6732
6733 pFpuResTwo->r80Result2 = *pr80Val;
6734 }
6735 else
6736 {
6737 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6738
6739 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6740 {
6741 pFpuResTwo->r80Result1 = *pr80Val;
6742 pFpuResTwo->r80Result2 = g_ar80One[0];
6743 }
6744 else
6745 {
6746 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6747 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6748 }
6749 fFsw |= X86_FSW_PE;
6750 if (!(fFcw & X86_FCW_PM))
6751 fFsw |= X86_FSW_ES | X86_FSW_B;
6752 }
6753 }
6754 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6755 {
6756 fFsw |= X86_FSW_DE;
6757
6758 if (fFcw & X86_FCW_DM)
6759 {
6760 pFpuResTwo->r80Result1 = *pr80Val;
6761 pFpuResTwo->r80Result2 = g_ar80One[0];
6762 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6763
6764 if (fFcw & X86_FCW_PM)
6765 {
6766 fFsw |= X86_FSW_PE;
6767 }
6768 else
6769 {
6770 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6771 }
6772
6773 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6774 }
6775 else
6776 {
6777 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6778 pFpuResTwo->r80Result2 = *pr80Val;
6779 fFsw |= X86_FSW_ES | X86_FSW_B;
6780 }
6781 }
6782 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6783 {
6784 fFsw |= X86_FSW_DE;
6785
6786 if (fFcw & X86_FCW_DM)
6787 {
6788 pFpuResTwo->r80Result2 = g_ar80One[0];
6789
6790 if (fFcw & X86_FCW_UM)
6791 {
6792 pFpuResTwo->r80Result1 = *pr80Val;
6793 }
6794 else
6795 {
6796 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6797 uint64_t uMantissa = pr80Val->s.uMantissa;
6798 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6799
6800 uExponent = 64 - uExponent;
6801 uMantissa <<= uExponent;
6802 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6803
6804 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6805 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6806 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6807 }
6808
6809 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6810 fFsw |= X86_FSW_UE | X86_FSW_PE;
6811
6812 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6813 {
6814 /* All the exceptions are masked. */
6815 }
6816 else
6817 {
6818 fFsw |= X86_FSW_ES | X86_FSW_B;
6819 }
6820 }
6821 else
6822 {
6823 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6824 pFpuResTwo->r80Result2 = *pr80Val;
6825 fFsw |= X86_FSW_ES | X86_FSW_B;
6826 }
6827 }
6828 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6829 {
6830 pFpuResTwo->r80Result1 = *pr80Val;
6831 pFpuResTwo->r80Result2 = *pr80Val;
6832 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6833 }
6834 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6835 {
6836 if (fFcw & X86_FCW_IM)
6837 {
6838 pFpuResTwo->r80Result1 = g_r80Indefinite;
6839 pFpuResTwo->r80Result2 = g_r80Indefinite;
6840 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6841 }
6842 else
6843 {
6844 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6845 pFpuResTwo->r80Result2 = *pr80Val;
6846 }
6847
6848 fFsw |= X86_FSW_IE;
6849 if (!(fFcw & X86_FCW_IM))
6850 fFsw |= X86_FSW_ES | X86_FSW_B;
6851 }
6852 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6853 {
6854 pFpuResTwo->r80Result1 = *pr80Val;
6855 pFpuResTwo->r80Result2 = *pr80Val;
6856
6857 if (fFcw & X86_FCW_IM)
6858 {
6859 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6860 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6861 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6862 }
6863 else
6864 {
6865 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6866 pFpuResTwo->r80Result2 = *pr80Val;
6867 }
6868
6869 fFsw |= X86_FSW_IE;
6870 if (!(fFcw & X86_FCW_IM))
6871 fFsw |= X86_FSW_ES | X86_FSW_B;
6872 }
6873 else if (RTFLOAT80U_IS_INF(pr80Val))
6874 {
6875 if (fFcw & X86_FCW_IM)
6876 {
6877 pFpuResTwo->r80Result1 = g_r80Indefinite;
6878 pFpuResTwo->r80Result2 = g_r80Indefinite;
6879 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6880 }
6881 else
6882 {
6883 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6884 pFpuResTwo->r80Result2 = *pr80Val;
6885 }
6886
6887 fFsw |= X86_FSW_IE;
6888 if (!(fFcw & X86_FCW_IM))
6889 fFsw |= X86_FSW_ES | X86_FSW_B;
6890 }
6891
6892 pFpuResTwo->FSW = fFsw;
6893}
6894#endif /* IEM_WITHOUT_ASSEMBLY */
6895
/**
 * AMD flavoured FSINCOS entry point.
 *
 * Simply forwards all arguments to iemAImpl_fsincos_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6900
/**
 * Intel flavoured FSINCOS entry point.
 *
 * Simply forwards all arguments to iemAImpl_fsincos_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6905
6906#ifdef IEM_WITHOUT_ASSEMBLY
6907
6908
6909/*********************************************************************************************************************************
6910* x87 FPU Compare and Testing Operations *
6911*********************************************************************************************************************************/
6912
6913IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6914{
6915 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6916
6917 if (RTFLOAT80U_IS_ZERO(pr80Val))
6918 fFsw |= X86_FSW_C3;
6919 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6920 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6921 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6922 {
6923 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6924 if (!(pFpuState->FCW & X86_FCW_DM))
6925 fFsw |= X86_FSW_ES | X86_FSW_B;
6926 }
6927 else
6928 {
6929 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6930 if (!(pFpuState->FCW & X86_FCW_IM))
6931 fFsw |= X86_FSW_ES | X86_FSW_B;
6932 }
6933
6934 *pu16Fsw = fFsw;
6935}
6936
6937
6938IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6939{
6940 RT_NOREF(pFpuState);
6941 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6942
6943 /* C1 = sign bit (always, even if empty Intel says). */
6944 if (pr80Val->s.fSign)
6945 fFsw |= X86_FSW_C1;
6946
6947 /* Classify the value in C0, C2, C3. */
6948 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6949 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6950 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6951 fFsw |= X86_FSW_C2;
6952 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6953 fFsw |= X86_FSW_C3;
6954 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6955 fFsw |= X86_FSW_C0;
6956 else if (RTFLOAT80U_IS_INF(pr80Val))
6957 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6958 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6959 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6960 /* whatever else: 0 */
6961
6962 *pu16Fsw = fFsw;
6963}
6964
6965
/**
 * Worker for fcom, fucom, and friends.
 *
 * Compares two 80-bit values and reports the outcome in C0, C2 and C3:
 * C3 for equal, C0 for val1 < val2, none for val1 > val2, and all three for
 * unordered (invalid encodings and NaNs).
 *
 * @returns The updated FSW: @a fFsw with the condition bits and any raised
 *          exception flags (IE, DE, ES, B) ORed in.
 * @param   pr80Val1        The first value.
 * @param   pr80Val2        The second value.
 * @param   fFcw            The FPU control word; only the IM and DM masks are
 *                          consulted.
 * @param   fFsw            The FSW value to start out with (the callers pass
 *                          the TOP field value here).
 * @param   fIeOnAllNaNs    Whether to raise \#IE on all NaNs (true) or only on
 *                          signalling ones (false).
 */
static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
                                                uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
{
    /*
     * Unpack the values.
     */
    bool const fSign1 = pr80Val1->s.fSign;
    int32_t iExponent1 = pr80Val1->s.uExponent;
    uint64_t uMantissa1 = pr80Val1->s.uMantissa;

    bool const fSign2 = pr80Val2->s.fSign;
    int32_t iExponent2 = pr80Val2->s.uExponent;
    uint64_t uMantissa2 = pr80Val2->s.uMantissa;

    /*
     * Check for invalid inputs.
     */
    if (   RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
        || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
    {
        if (!(fFcw & X86_FCW_IM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
        return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
    }

    /*
     * Check for NaNs and indefinites, they are all unordered and trumps #DE.
     */
    if (   RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
        || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
    {
        if (   fIeOnAllNaNs
            || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
            || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
        {
            fFsw |= X86_FSW_IE;
            if (!(fFcw & X86_FCW_IM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
    }

    /*
     * Normalize the values.
     *
     * Pseudo-denormals simply get exponent 1.  Real denormals have their
     * mantissa shifted up until the highest set bit is bit 63, with the
     * exponent lowered accordingly (it ends up zero or negative, which is why
     * the exponents are kept in signed variables).  Either kind raises #DE.
     */
    if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
    {
        if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
            iExponent1 = 1;
        else
        {
            iExponent1 = 64 - ASMBitLastSetU64(uMantissa1); /* the shift count */
            uMantissa1 <<= iExponent1;
            iExponent1 = 1 - iExponent1;
        }
        fFsw |= X86_FSW_DE;
        if (!(fFcw & X86_FCW_DM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
    {
        if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
            iExponent2 = 1;
        else
        {
            iExponent2 = 64 - ASMBitLastSetU64(uMantissa2); /* the shift count */
            uMantissa2 <<= iExponent2;
            iExponent2 = 1 - iExponent2;
        }
        fFsw |= X86_FSW_DE;
        if (!(fFcw & X86_FCW_DM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    /*
     * Test if equal (val1 == val2):
     */
    if (   uMantissa1 == uMantissa2
        && iExponent1 == iExponent2
        && (   fSign1 == fSign2
            || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
        fFsw |= X86_FSW_C3;
    /*
     * Test if less than (val1 < val2):
     */
    else if (fSign1 && !fSign2)
        fFsw |= X86_FSW_C0;
    else if (fSign1 == fSign2)
    {
        /* Zeros are problematic, however at the most one can be zero here. */
        if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
            return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
        if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
            return fSign1 ? fFsw | X86_FSW_C0 : fFsw;

        /* Compare magnitudes; for negative values the smaller magnitude is
           the greater value, hence the XOR with the sign. */
        if (  fSign1
            ^ (   iExponent1 < iExponent2
               || (   iExponent1 == iExponent2
                   && uMantissa1 < uMantissa2 ) ) )
            fFsw |= X86_FSW_C0;
    }
    /* else: No flags set if greater. */

    return fFsw;
}
7075
7076
/**
 * FCOM with two 80-bit operands.
 *
 * Calls the compare worker with TOP = 6 as the initial FSW and with \#IE
 * raised on all NaNs.
 *
 * @param   pFpuState   The FPU state; only FCW is read.
 * @param   pfFsw       Where to return the new FSW.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
}
7082
7083
7084
7085
/**
 * FUCOM with two 80-bit operands.
 *
 * Same as iemAImpl_fcom_r80_by_r80, except that the worker is told not to
 * raise \#IE on all NaNs (fIeOnAllNaNs = false).
 *
 * @param   pFpuState   The FPU state; only FCW is read.
 * @param   pfFsw       Where to return the new FSW.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
}
7091
7092
7093IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7094 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
7095{
7096 RTFLOAT80U r80Val2;
7097 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
7098 Assert(!fFsw || fFsw == X86_FSW_DE);
7099 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7100 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7101 {
7102 if (!(pFpuState->FCW & X86_FCW_DM))
7103 fFsw |= X86_FSW_ES | X86_FSW_B;
7104 *pfFsw |= fFsw;
7105 }
7106}
7107
7108
7109IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7110 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
7111{
7112 RTFLOAT80U r80Val2;
7113 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
7114 Assert(!fFsw || fFsw == X86_FSW_DE);
7115 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7116 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7117 {
7118 if (!(pFpuState->FCW & X86_FCW_DM))
7119 fFsw |= X86_FSW_ES | X86_FSW_B;
7120 *pfFsw |= fFsw;
7121 }
7122}
7123
7124
7125IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7126 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
7127{
7128 RTFLOAT80U r80Val2;
7129 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
7130 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7131}
7132
7133
7134IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7135 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
7136{
7137 RTFLOAT80U r80Val2;
7138 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
7139 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7140}
7141
7142
7143/**
7144 * Worker for fcomi & fucomi.
7145 */
7146static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7147 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
7148{
7149 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
7150 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
7151 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
7152 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
7153
7154 /* Note! C1 is not cleared as per docs! Everything is preserved. */
7155 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
7156 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
7157}
7158
7159
/**
 * FCOMI entry point.
 *
 * Forwards to iemAImpl_fcomi_r80_by_r80_worker using FCW and FSW from the FPU
 * state, with fIeOnAllNaNs set to true.
 *
 * @returns The EFLAGS value produced by the worker.
 * @param   pFpuState   The FPU state; FCW and FSW are read.
 * @param   pfFsw       Where to return the new FPU status word.
 * @param   pr80Val1    The first operand.
 * @param   pr80Val2    The second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
}
7165
7166
/**
 * FUCOMI entry point.
 *
 * Forwards to iemAImpl_fcomi_r80_by_r80_worker using FCW and FSW from the FPU
 * state, with fIeOnAllNaNs set to false (the only difference from the FCOMI
 * entry point).
 *
 * @returns The EFLAGS value produced by the worker.
 * @param   pFpuState   The FPU state; FCW and FSW are read.
 * @param   pfFsw       Where to return the new FPU status word.
 * @param   pr80Val1    The first operand.
 * @param   pr80Val2    The second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
}
7172
7173
7174/*********************************************************************************************************************************
7175* x87 FPU Other Operations *
7176*********************************************************************************************************************************/
7177
7178/**
7179 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7180 */
7181static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7182{
7183 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7184 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7185 true /*exact / generate #PE */, &SoftState));
7186 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7187}
7188
7189
7190IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7191{
7192 uint16_t const fFcw = pFpuState->FCW;
7193 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7194
7195 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7196 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7197 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7198 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7199 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7200 || RTFLOAT80U_IS_INF(pr80Val))
7201 pFpuRes->r80Result = *pr80Val;
7202 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7203 {
7204 fFsw |= X86_FSW_DE;
7205 if (fFcw & X86_FCW_DM)
7206 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7207 else
7208 {
7209 pFpuRes->r80Result = *pr80Val;
7210 fFsw |= X86_FSW_ES | X86_FSW_B;
7211 }
7212 }
7213 else
7214 {
7215 if (fFcw & X86_FCW_IM)
7216 {
7217 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7218 pFpuRes->r80Result = g_r80Indefinite;
7219 else
7220 {
7221 pFpuRes->r80Result = *pr80Val;
7222 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7223 }
7224 }
7225 else
7226 {
7227 pFpuRes->r80Result = *pr80Val;
7228 fFsw |= X86_FSW_ES | X86_FSW_B;
7229 }
7230 fFsw |= X86_FSW_IE;
7231 }
7232 pFpuRes->FSW = fFsw;
7233}
7234
7235
7236IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7237 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7238{
7239 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7240 it does everything we need it to do. */
7241 uint16_t const fFcw = pFpuState->FCW;
7242 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7243 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7244 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7245 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7246}
7247
7248
7249/**
7250 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7251 */
7252static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7253{
7254 Assert(!pr80Val->s.fSign);
7255 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7256 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7257 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7258}
7259
7260
7261IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7262{
7263 uint16_t const fFcw = pFpuState->FCW;
7264 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7265
7266 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7267 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7268 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7269 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7270 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7271 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7272 pFpuRes->r80Result = *pr80Val;
7273 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7274 {
7275 fFsw |= X86_FSW_DE;
7276 if (fFcw & X86_FCW_DM)
7277 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7278 else
7279 {
7280 pFpuRes->r80Result = *pr80Val;
7281 fFsw |= X86_FSW_ES | X86_FSW_B;
7282 }
7283 }
7284 else
7285 {
7286 if (fFcw & X86_FCW_IM)
7287 {
7288 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7289 pFpuRes->r80Result = g_r80Indefinite;
7290 else
7291 {
7292 pFpuRes->r80Result = *pr80Val;
7293 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7294 }
7295 }
7296 else
7297 {
7298 pFpuRes->r80Result = *pr80Val;
7299 fFsw |= X86_FSW_ES | X86_FSW_B;
7300 }
7301 fFsw |= X86_FSW_IE;
7302 }
7303 pFpuRes->FSW = fFsw;
7304}
7305
7306
7307/**
 * @code{.unparsed}
 *          x          x * ln2
 *  f(x) = 2  - 1  =  e        - 1
 *
 * @endcode
7313 *
7314 * We can approximate e^x by a Taylor/Maclaurin series (see
7315 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
 * @code{.unparsed}
 *            n      0     1     2     3     4
 *   inf     x      x     x     x     x     x
 *   SUM   ----- = --- + --- + --- + --- + --- + ...
 *   n=0     n!     0!    1!    2!    3!    4!
 *
 *                  2     3     4
 *                 x     x     x
 *      = 1 + x + --- + --- + --- + ...
 *                 2!    3!    4!
 * @endcode
7327 *
7328 * Given z = x * ln2, we get:
 * @code{.unparsed}
 *                 2     3     4           n
 *   z            z     z     z           z
 *  e  - 1 = z + --- + --- + --- + ... + ---
 *                2!    3!    4!          n!
 * @endcode
7335 *
7336 * Wanting to use Horner's method, we move one z outside and get:
 * @code{.unparsed}
 *                    2     3           (n-1)
 *             z     z     z           z
 *  = z ( 1 + --- + --- + --- + ... + ------- )
 *             2!    3!    4!            n!
 * @endcode
7343 *
7344 * The constants we need for using Horner's methods are 1 and 1 / n!.
7345 *
 * For very tiny x values, we can get away with f(x) = x * ln 2, because
 * we don't have the necessary precision to represent 1.0 + z/2 + ...
 * and can approximate it to be 1.0.  For a visual demonstration of this
 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
 * as it is valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7351 *
7352 *
7353 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7354 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7355 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7356 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7357 * blocks). (The one bit difference is probably an implicit one missing from
7358 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7359 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7360 * exponent.
7361 *
 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
 * not yet successfully reproduced the exact results from an Intel 10980XE,
 * there is always a portion of rounding differences.  Not going to spend too
 * much time on getting this 100% the same, at least not now.
7366 *
 * P.S. If someone is really curious about the 8087 and its constants:
7368 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7369 *
7370 *
7371 * @param pr80Val The exponent value (x), less than 1.0, greater than
7372 * -1.0 and not zero. This can be a normal, denormal
7373 * or pseudo-denormal value.
7374 * @param pr80Result Where to return the result.
7375 * @param fFcw FPU control word.
7376 * @param fFsw FPU status word.
7377 */
static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
{
    /* As mentioned above, we can skip the expensive polynomial calculation
       as it will be close enough to 1.0 that it makes no difference.

       The cutoff point for intel 10980XE is exponents >= -69, i.e. the
       shortcut below is taken for unbiased exponents of -69 and lower.  Intel
       also seems to be using a 67-bit or 68-bit constant value, and we get
       a smattering of rounding differences if we go for higher precision. */
    if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
    {
        /* Tiny input: the result is just x * ln2.  Multiply the input mantissa
           by the ln2 mantissa constant and let the rounding helper compose the
           80-bit result.  The exponent handed to it is the unbiased input
           exponent, or 1 - bias for denormals and pseudo-denormals. */
        RTUINT256U u256;
        RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
        u256.QWords.qw0 |= 1; /* force #PE */
        fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
                                                      !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
                                                   ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
                                                   : 1 - RTFLOAT80U_EXP_BIAS,
                                                   fFcw, fFsw);
    }
    else
    {
#ifdef IEM_WITH_FLOAT128_FOR_FPU
        /* Variant using the compiler's _Float128 type: compute 2^x - 1 directly
           under the rounding mode selected by the control word, restoring the
           previous rounding mode afterwards. */
        /* This approach is not good enough for small values, we end up with zero. */
        int const fOldRounding = iemFpuF128SetRounding(fFcw);
        _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
        _Float128 rd128Result = powf128(2.0L, rd128Val);
        rd128Result -= 1.0L;
        fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
        iemFpuF128RestoreRounding(fOldRounding);

# else
        /* SoftFloat variant: evaluate z * poly(z) with z = x * ln2 in 128-bit
           precision.  The SoftFloat state is set up with defaults rather than
           from the control word; fFcw only comes into play in the final
           conversion to 80-bit. */
        softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
        float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);

        /* As mentioned above, enforce 68-bit internal mantissa width to better
           match the Intel 10980XE results. */
        unsigned const cPrecision = 68;

        /* first calculate z = x * ln2 */
        float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
                                               cPrecision);

        /* Then do the polynomial evaluation. */
        float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
                                                cPrecision, &SoftState);
        /* Multiply by the z that was moved outside the polynomial. */
        r = f128_mul(z, r, &SoftState);

        /* Output the result. */
        fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
# endif
    }
    return fFsw;
}
7431
7432
7433IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7434{
7435 uint16_t const fFcw = pFpuState->FCW;
7436 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7437
7438 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7439 {
7440 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7441 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7442 else
7443 {
7444 /* Special case:
7445 2^+1.0 - 1.0 = 1.0
7446 2^-1.0 - 1.0 = -0.5 */
7447 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7448 && pr80Val->s.uMantissa == RT_BIT_64(63))
7449 {
7450 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7451 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7452 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7453 }
7454 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7455 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7456 else
7457 pFpuRes->r80Result = *pr80Val;
7458 fFsw |= X86_FSW_PE;
7459 if (!(fFcw & X86_FCW_PM))
7460 fFsw |= X86_FSW_ES | X86_FSW_B;
7461 }
7462 }
7463 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7464 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7465 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7466 pFpuRes->r80Result = *pr80Val;
7467 else if (RTFLOAT80U_IS_INF(pr80Val))
7468 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7469 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7470 {
7471 fFsw |= X86_FSW_DE;
7472 if (fFcw & X86_FCW_DM)
7473 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7474 else
7475 {
7476 pFpuRes->r80Result = *pr80Val;
7477 fFsw |= X86_FSW_ES | X86_FSW_B;
7478 }
7479 }
7480 else
7481 {
7482 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7483 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7484 && (fFcw & X86_FCW_IM))
7485 pFpuRes->r80Result = g_r80Indefinite;
7486 else
7487 {
7488 pFpuRes->r80Result = *pr80Val;
7489 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7490 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7491 }
7492 fFsw |= X86_FSW_IE;
7493 if (!(fFcw & X86_FCW_IM))
7494 fFsw |= X86_FSW_ES | X86_FSW_B;
7495 }
7496 pFpuRes->FSW = fFsw;
7497}
7498
7499#endif /* IEM_WITHOUT_ASSEMBLY */
7500
/**
 * AMD flavour of F2XM1.
 *
 * Simply forwards to iemAImpl_f2xm1_r80, so there is currently no AMD specific
 * behaviour.
 *
 * @param   pFpuState   The FPU state.
 * @param   pFpuRes     Where to return the result value and status word.
 * @param   pr80Val     The exponent value (x).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
7505
/**
 * Intel flavour of F2XM1.
 *
 * Simply forwards to iemAImpl_f2xm1_r80, so there is currently no Intel
 * specific behaviour.
 *
 * @param   pFpuState   The FPU state.
 * @param   pFpuRes     Where to return the result value and status word.
 * @param   pr80Val     The exponent value (x).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
7510
7511#ifdef IEM_WITHOUT_ASSEMBLY
7512
7513IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7514{
7515 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7516 pFpuRes->r80Result = *pr80Val;
7517 pFpuRes->r80Result.s.fSign = 0;
7518}
7519
7520
7521IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7522{
7523 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7524 pFpuRes->r80Result = *pr80Val;
7525 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7526}
7527
7528
/**
 * FXTRACT - splits a value into its exponent and its significand.
 *
 * For normal input, r80Result1 receives the unbiased exponent converted to an
 * 80-bit float and r80Result2 receives the input's sign and mantissa with the
 * exponent field set to the bias.
 *
 * The status word starts out with TOP = 6.  On the paths where an unmasked
 * exception is flagged (ES + B) TOP is changed to 7 and only r80Result2 is
 * written (with the unmodified input); r80Result1 is left untouched there.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read.
 * @param   pFpuResTwo  Where to return the two result values and the status
 *                      word.
 * @param   pr80Val     The value to split.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    if (RTFLOAT80U_IS_NORMAL(pr80Val))
    {
        /* Result 1: the unbiased exponent as a float.  The SoftFloat state is
           only there to satisfy the interface and is not looked at afterwards. */
        softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
        iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));

        /* Result 2: same sign and mantissa, exponent field set to the bias. */
        pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
        pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
        pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
    }
    else if (RTFLOAT80U_IS_ZERO(pr80Val))
    {
        /* Zero input raises #ZE. */
        fFsw |= X86_FSW_ZE;
        if (fFcw & X86_FCW_ZM)
        {
            /* Masked: g_ar80Infinity[1] as the exponent, the zero itself as the significand. */
            pFpuResTwo->r80Result1 = g_ar80Infinity[1];
            pFpuResTwo->r80Result2 = *pr80Val;
        }
        else
        {
            pFpuResTwo->r80Result2 = *pr80Val;
            fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
    {
        fFsw |= X86_FSW_DE;
        if (fFcw & X86_FCW_DM)
        {
            pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
            pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
            pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
            /* Normalize the mantissa: shift left until bit 63 is set, lowering
               the exponent by one per shift.  Zero was handled above, so the
               loop is expected to terminate. */
            int32_t iExponent = -16382;
            while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
            {
                pFpuResTwo->r80Result2.s.uMantissa <<= 1;
                iExponent--;
            }

            softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
            iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
        }
        else
        {
            pFpuResTwo->r80Result2 = *pr80Val;
            fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    else if (   RTFLOAT80U_IS_QUIET_NAN(pr80Val)
             || RTFLOAT80U_IS_INDEFINITE(pr80Val))
    {
        /* Quiet NaNs and the indefinite value are returned in both results, no flags. */
        pFpuResTwo->r80Result1 = *pr80Val;
        pFpuResTwo->r80Result2 = *pr80Val;
    }
    else if (RTFLOAT80U_IS_INF(pr80Val))
    {
        /* Infinity: g_ar80Infinity[0] as the exponent, the input as the significand. */
        pFpuResTwo->r80Result1 = g_ar80Infinity[0];
        pFpuResTwo->r80Result2 = *pr80Val;
    }
    else
    {
        /* Everything else is an invalid operand. */
        if (fFcw & X86_FCW_IM)
        {
            /* Masked: signalling NaNs are quietened, the rest becomes the
               indefinite value; both results get the same value. */
            if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
                pFpuResTwo->r80Result1 = g_r80Indefinite;
            else
            {
                pFpuResTwo->r80Result1 = *pr80Val;
                pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
            }
            pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
        }
        else
        {
            pFpuResTwo->r80Result2 = *pr80Val;
            fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
        }
        fFsw |= X86_FSW_IE;
    }
    pFpuResTwo->FSW = fFsw;
}
7614#endif /* IEM_WITHOUT_ASSEMBLY */
7615
7616#if defined(IEM_WITHOUT_ASSEMBLY)
7617
7618static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7619{
7620 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7621 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7622 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7623 extFloat80_t v;
7624 (void)fFcw;
7625
7626 v = extF80_ylog2x(y, x, &SoftState);
7627 iemFpuSoftF80ToIprt(pr80Result, v);
7628
7629 return fFsw;
7630}
7631
7632IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7633 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7634{
7635 uint16_t const fFcw = pFpuState->FCW;
7636 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7637
7638 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7639 {
7640 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7641
7642 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7643 if (!(fFcw & X86_FCW_PM))
7644 fFsw |= X86_FSW_ES | X86_FSW_B;
7645 }
7646 else
7647 {
7648 fFsw |= X86_FSW_IE;
7649
7650 if (!(fFcw & X86_FCW_IM))
7651 {
7652 pFpuRes->r80Result = *pr80Val2;
7653 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7654 }
7655 else
7656 {
7657 pFpuRes->r80Result = g_r80Indefinite;
7658 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7659 }
7660 }
7661
7662 pFpuRes->FSW = fFsw;
7663}
7664#endif /* IEM_WITHOUT_ASSEMBLY */
7665
/**
 * Intel flavour of FYL2X.
 *
 * Simply forwards to iemAImpl_fyl2x_r80_by_r80, so there is currently no
 * Intel specific behaviour.
 *
 * @param   pFpuState   The FPU state.
 * @param   pFpuRes     Where to return the result value and status word.
 * @param   pr80Val1    The first operand.
 * @param   pr80Val2    The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7671
/**
 * AMD flavour of FYL2X.
 *
 * Simply forwards to iemAImpl_fyl2x_r80_by_r80, so there is currently no AMD
 * specific behaviour.
 *
 * @param   pFpuState   The FPU state.
 * @param   pFpuRes     Where to return the result value and status word.
 * @param   pr80Val1    The first operand.
 * @param   pr80Val2    The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7677
7678#if defined(IEM_WITHOUT_ASSEMBLY)
7679
7680static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7681{
7682 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7683 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7684 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7685 extFloat80_t v;
7686 (void)fFcw;
7687
7688 v = extF80_ylog2xp1(y, x, &SoftState);
7689 iemFpuSoftF80ToIprt(pr80Result, v);
7690
7691 return fFsw;
7692}
7693
7694IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7695 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7696{
7697 uint16_t const fFcw = pFpuState->FCW;
7698 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7699
7700 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7701 {
7702 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7703
7704 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7705 if (!(fFcw & X86_FCW_PM))
7706 fFsw |= X86_FSW_ES | X86_FSW_B;
7707 }
7708 else
7709 {
7710 fFsw |= X86_FSW_IE;
7711
7712 if (!(fFcw & X86_FCW_IM))
7713 {
7714 pFpuRes->r80Result = *pr80Val2;
7715 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7716 }
7717 else
7718 {
7719 pFpuRes->r80Result = g_r80Indefinite;
7720 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7721 }
7722 }
7723
7724 pFpuRes->FSW = fFsw;
7725}
7726
7727#endif /* IEM_WITHOUT_ASSEMBLY */
7728
/**
 * Intel flavour of FYL2XP1.
 *
 * Simply forwards to iemAImpl_fyl2xp1_r80_by_r80, so there is currently no
 * Intel specific behaviour.
 *
 * @param   pFpuState   The FPU state.
 * @param   pFpuRes     Where to return the result value and status word.
 * @param   pr80Val1    The first operand.
 * @param   pr80Val2    The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                           PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7734
/**
 * AMD flavour of FYL2XP1.
 *
 * Simply forwards to iemAImpl_fyl2xp1_r80_by_r80, so there is currently no
 * AMD specific behaviour.
 *
 * @param   pFpuState   The FPU state.
 * @param   pFpuRes     Where to return the result value and status word.
 * @param   pr80Val1    The first operand.
 * @param   pr80Val2    The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7740
7741
7742/*********************************************************************************************************************************
7743* MMX, SSE & AVX *
7744*********************************************************************************************************************************/
7745
7746#ifdef IEM_WITH_VEX
7747
7748/*
7749 * VMOVSLDUP
7750 */
7751IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7752{
7753 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7754 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7755 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7756 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7757 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7758 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7759 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7760 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7761}
7762
7763
7764IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7765{
7766 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7767 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7768 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7769 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7770 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7771 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7772 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7773 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7774}
7775
7776#endif /* IEM_WITH_VEX */
7777
7778
7779#ifdef IEM_WITH_VEX
7780
7781/*
7782 * VMOVSHDUP
7783 */
7784IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7785{
7786 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7787 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7788 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7789 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7790 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7791 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7792 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7793 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7794}
7795
7796
7797IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7798{
7799 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7800 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7801 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7802 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7803 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7804 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7805 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7806 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7807}
7808
7809#endif /* IEM_WITH_VEX */
7810
7811
7812#ifdef IEM_WITH_VEX
7813
7814/*
7815 * VMOVDDUP
7816 */
7817IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7818{
7819 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7820 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7821 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7822 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7823}
7824
7825IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7826{
7827 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7828 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7829 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7830 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7831}
7832
7833#endif /* IEM_WITH_VEX */
7834
7835
7836/*
7837 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7838 */
7839#ifdef IEM_WITHOUT_ASSEMBLY
7840
7841IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7842{
7843 RT_NOREF(pFpuState);
7844 *puDst &= *puSrc;
7845}
7846
7847
7848IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7849{
7850 RT_NOREF(pFpuState);
7851 puDst->au64[0] &= puSrc->au64[0];
7852 puDst->au64[1] &= puSrc->au64[1];
7853}
7854
7855#endif
7856
7857IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7858 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7859{
7860 RT_NOREF(pExtState);
7861 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7862 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7863}
7864
7865
7866IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7867 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7868{
7869 RT_NOREF(pExtState);
7870 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7871 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7872 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7873 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7874}
7875
7876
7877/*
7878 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7879 */
7880#ifdef IEM_WITHOUT_ASSEMBLY
7881
7882IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7883{
7884 RT_NOREF(pFpuState);
7885 *puDst = ~*puDst & *puSrc;
7886}
7887
7888
7889IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7890{
7891 RT_NOREF(pFpuState);
7892 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7893 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7894}
7895
7896#endif
7897
7898IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7899 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7900{
7901 RT_NOREF(pExtState);
7902 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7903 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7904}
7905
7906
7907IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7908 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7909{
7910 RT_NOREF(pExtState);
7911 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7912 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7913 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7914 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7915}
7916
7917
7918/*
7919 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7920 */
7921#ifdef IEM_WITHOUT_ASSEMBLY
7922
7923IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7924{
7925 RT_NOREF(pFpuState);
7926 *puDst |= *puSrc;
7927}
7928
7929
7930IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7931{
7932 RT_NOREF(pFpuState);
7933 puDst->au64[0] |= puSrc->au64[0];
7934 puDst->au64[1] |= puSrc->au64[1];
7935}
7936
7937#endif
7938
7939IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7940 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7941{
7942 RT_NOREF(pExtState);
7943 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7944 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7945}
7946
7947
7948IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7949 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7950{
7951 RT_NOREF(pExtState);
7952 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7953 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7954 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7955 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7956}
7957
7958
7959/*
7960 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7961 */
7962#ifdef IEM_WITHOUT_ASSEMBLY
7963
7964IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7965{
7966 RT_NOREF(pFpuState);
7967 *puDst ^= *puSrc;
7968}
7969
7970
7971IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7972{
7973 RT_NOREF(pFpuState);
7974 puDst->au64[0] ^= puSrc->au64[0];
7975 puDst->au64[1] ^= puSrc->au64[1];
7976}
7977
7978#endif
7979
7980IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7981 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7982{
7983 RT_NOREF(pExtState);
7984 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7985 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7986}
7987
7988
7989IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7990 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7991{
7992 RT_NOREF(pExtState);
7993 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7994 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7995 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7996 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7997}
7998
7999
8000/*
8001 * PCMPEQB / VPCMPEQB
8002 */
8003#ifdef IEM_WITHOUT_ASSEMBLY
8004
8005IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8006{
8007 RT_NOREF(pFpuState);
8008 RTUINT64U uSrc1 = { *puDst };
8009 RTUINT64U uSrc2 = { *puSrc };
8010 RTUINT64U uDst;
8011 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
8012 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
8013 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
8014 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
8015 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
8016 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
8017 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
8018 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
8019 *puDst = uDst.u;
8020}
8021
8022
8023IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8024{
8025 RT_NOREF(pFpuState);
8026 RTUINT128U uSrc1 = *puDst;
8027 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
8028 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
8029 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
8030 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
8031 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
8032 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
8033 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
8034 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
8035 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
8036 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
8037 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
8038 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
8039 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
8040 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
8041 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
8042 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
8043}
8044
8045#endif
8046
8047IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8048 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8049{
8050 RT_NOREF(pExtState);
8051 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8052 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8053 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8054 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8055 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8056 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8057 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8058 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8059 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8060 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8061 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8062 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8063 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8064 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8065 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8066 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8067}
8068
8069IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8070 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8071{
8072 RT_NOREF(pExtState);
8073 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8074 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8075 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8076 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8077 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8078 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8079 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8080 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8081 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8082 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8083 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8084 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8085 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8086 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8087 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8088 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8089 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
8090 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
8091 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
8092 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
8093 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
8094 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
8095 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
8096 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
8097 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
8098 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
8099 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
8100 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
8101 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
8102 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
8103 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
8104 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
8105}
8106
8107
8108/*
8109 * PCMPEQW / VPCMPEQW
8110 */
8111#ifdef IEM_WITHOUT_ASSEMBLY
8112
8113IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8114{
8115 RT_NOREF(pFpuState);
8116 RTUINT64U uSrc1 = { *puDst };
8117 RTUINT64U uSrc2 = { *puSrc };
8118 RTUINT64U uDst;
8119 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
8120 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
8121 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
8122 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
8123 *puDst = uDst.u;
8124}
8125
8126
8127IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8128{
8129 RT_NOREF(pFpuState);
8130 RTUINT128U uSrc1 = *puDst;
8131 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
8132 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
8133 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
8134 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
8135 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
8136 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
8137 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
8138 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
8139}
8140
8141#endif
8142
8143IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8144 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8145{
8146 RT_NOREF(pExtState);
8147 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8148 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8149 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8150 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8151 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8152 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8153 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8154 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8155}
8156
8157IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8158 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8159{
8160 RT_NOREF(pExtState);
8161 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8162 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8163 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8164 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8165 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8166 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8167 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8168 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8169 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8170 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8171 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8172 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8173 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8174 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8175 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8176 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8177}
8178
8179
8180/*
8181 * PCMPEQD / VPCMPEQD.
8182 */
8183#ifdef IEM_WITHOUT_ASSEMBLY
8184
8185IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8186{
8187 RT_NOREF(pFpuState);
8188 RTUINT64U uSrc1 = { *puDst };
8189 RTUINT64U uSrc2 = { *puSrc };
8190 RTUINT64U uDst;
8191 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8192 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8193 *puDst = uDst.u;
8194}
8195
8196
8197IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8198{
8199 RT_NOREF(pFpuState);
8200 RTUINT128U uSrc1 = *puDst;
8201 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8202 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8203 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8204 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8205}
8206
8207#endif /* IEM_WITHOUT_ASSEMBLY */
8208
8209IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8210 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8211{
8212 RT_NOREF(pExtState);
8213 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8214 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8215 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8216 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8217}
8218
8219IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8220 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8221{
8222 RT_NOREF(pExtState);
8223 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8224 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8225 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8226 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8227 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8228 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8229 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8230 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8231}
8232
8233
8234/*
8235 * PCMPEQQ / VPCMPEQQ.
8236 */
8237IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8238{
8239 RT_NOREF(pFpuState);
8240 RTUINT128U uSrc1 = *puDst;
8241 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8242 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8243}
8244
8245IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8246 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8247{
8248 RT_NOREF(pExtState);
8249 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8250 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8251}
8252
8253IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8254 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8255{
8256 RT_NOREF(pExtState);
8257 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8258 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8259 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8260 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8261}
8262
8263
8264/*
8265 * PCMPGTB / VPCMPGTB
8266 */
8267#ifdef IEM_WITHOUT_ASSEMBLY
8268
8269IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8270{
8271 RT_NOREF(pFpuState);
8272 RTUINT64U uSrc1 = { *puDst };
8273 RTUINT64U uSrc2 = { *puSrc };
8274 RTUINT64U uDst;
8275 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8276 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8277 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8278 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8279 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8280 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8281 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8282 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8283 *puDst = uDst.u;
8284}
8285
8286
8287IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8288{
8289 RT_NOREF(pFpuState);
8290 RTUINT128U uSrc1 = *puDst;
8291 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8292 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8293 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8294 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8295 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8296 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8297 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8298 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8299 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8300 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8301 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8302 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8303 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8304 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8305 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8306 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8307}
8308
8309#endif
8310
8311IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8312 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8313{
8314 RT_NOREF(pExtState);
8315 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8316 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8317 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8318 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8319 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8320 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8321 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8322 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8323 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8324 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8325 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8326 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8327 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8328 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8329 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8330 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8331}
8332
8333IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8334 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8335{
8336 RT_NOREF(pExtState);
8337 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8338 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8339 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8340 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8341 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8342 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8343 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8344 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8345 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8346 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8347 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8348 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8349 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8350 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8351 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8352 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8353 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8354 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8355 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8356 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8357 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8358 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8359 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8360 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8361 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8362 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8363 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8364 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8365 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8366 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8367 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8368 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8369}
8370
8371
8372/*
8373 * PCMPGTW / VPCMPGTW
8374 */
8375#ifdef IEM_WITHOUT_ASSEMBLY
8376
8377IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8378{
8379 RT_NOREF(pFpuState);
8380 RTUINT64U uSrc1 = { *puDst };
8381 RTUINT64U uSrc2 = { *puSrc };
8382 RTUINT64U uDst;
8383 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8384 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8385 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8386 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8387 *puDst = uDst.u;
8388}
8389
8390
8391IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8392{
8393 RT_NOREF(pFpuState);
8394 RTUINT128U uSrc1 = *puDst;
8395 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8396 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8397 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8398 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8399 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8400 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8401 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8402 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8403}
8404
8405#endif
8406
8407IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8408 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8409{
8410 RT_NOREF(pExtState);
8411 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8412 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8413 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8414 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8415 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8416 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8417 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8418 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8419}
8420
8421IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8422 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8423{
8424 RT_NOREF(pExtState);
8425 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8426 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8427 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8428 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8429 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8430 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8431 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8432 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8433 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8434 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8435 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8436 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8437 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8438 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8439 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8440 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8441}
8442
8443
8444/*
8445 * PCMPGTD / VPCMPGTD.
8446 */
8447#ifdef IEM_WITHOUT_ASSEMBLY
8448
8449IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8450{
8451 RT_NOREF(pFpuState);
8452 RTUINT64U uSrc1 = { *puDst };
8453 RTUINT64U uSrc2 = { *puSrc };
8454 RTUINT64U uDst;
8455 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8456 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8457 *puDst = uDst.u;
8458}
8459
8460
8461IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8462{
8463 RT_NOREF(pFpuState);
8464 RTUINT128U uSrc1 = *puDst;
8465 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8466 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8467 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8468 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8469}
8470
8471#endif /* IEM_WITHOUT_ASSEMBLY */
8472
8473IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8474 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8475{
8476 RT_NOREF(pExtState);
8477 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8478 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8479 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8480 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8481}
8482
8483IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8484 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8485{
8486 RT_NOREF(pExtState);
8487 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8488 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8489 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8490 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8491 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8492 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8493 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8494 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8495}
8496
8497
8498/*
8499 * PCMPGTQ / VPCMPGTQ.
8500 */
8501IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8502{
8503 RT_NOREF(pFpuState);
8504 RTUINT128U uSrc1 = *puDst;
8505 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8506 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8507}
8508
8509IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8510 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8511{
8512 RT_NOREF(pExtState);
8513 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8514 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8515}
8516
8517IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8518 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8519{
8520 RT_NOREF(pExtState);
8521 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8522 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8523 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8524 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8525}
8526
8527
8528/*
8529 * PADDB / VPADDB
8530 */
8531#ifdef IEM_WITHOUT_ASSEMBLY
8532
8533IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8534{
8535 RT_NOREF(pFpuState);
8536 RTUINT64U uSrc1 = { *puDst };
8537 RTUINT64U uSrc2 = { *puSrc };
8538 RTUINT64U uDst;
8539 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8540 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8541 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8542 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8543 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8544 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8545 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8546 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8547 *puDst = uDst.u;
8548}
8549
8550
8551IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8552{
8553 RT_NOREF(pFpuState);
8554 RTUINT128U uSrc1 = *puDst;
8555 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8556 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8557 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8558 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8559 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8560 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8561 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8562 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8563 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8564 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8565 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8566 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8567 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8568 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8569 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8570 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8571}
8572
8573#endif
8574
8575
8576IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8577 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8578{
8579 RT_NOREF(pExtState);
8580 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8581 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8582 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8583 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8584 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8585 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8586 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8587 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8588 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8589 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8590 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8591 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8592 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8593 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8594 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8595 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8596}
8597
8598IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8599 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8600{
8601 RT_NOREF(pExtState);
8602 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8603 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8604 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8605 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8606 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8607 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8608 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8609 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8610 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8611 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8612 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8613 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8614 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8615 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8616 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8617 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8618 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8619 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8620 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8621 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8622 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8623 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8624 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8625 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8626 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8627 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8628 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8629 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8630 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8631 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8632 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8633 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8634}
8635
8636
8637/*
8638 * PADDSB / VPADDSB
8639 */
/**
 * Narrows a signed word-range value to a signed byte with saturation.
 *
 * Values in the range -128..127 are passed through as their 8-bit pattern.
 * Anything else yields 0x7f (INT8_MAX) when bit 15 of the input is clear and
 * 0x80 (INT8_MIN) when it is set.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) /* out of range: 0x7f + sign bit gives 0x7f or 0x80 */ \
      : (uint8_t)(a_iWord) ) /* in range: keep the low 8 bits as they are */
8644
8645#ifdef IEM_WITHOUT_ASSEMBLY
8646
8647IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8648{
8649 RT_NOREF(pFpuState);
8650 RTUINT64U uSrc1 = { *puDst };
8651 RTUINT64U uSrc2 = { *puSrc };
8652 RTUINT64U uDst;
8653 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8654 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8655 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8656 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8657 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8658 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8659 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8660 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8661 *puDst = uDst.u;
8662}
8663
8664
8665IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8666{
8667 RT_NOREF(pFpuState);
8668 RTUINT128U uSrc1 = *puDst;
8669 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8670 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8671 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8672 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8673 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8674 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8675 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8676 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8677 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8678 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8679 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8680 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8681 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8682 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8683 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8684 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8685}
8686
8687#endif
8688
8689IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u128_fallback,(PRTUINT128U puDst,
8690 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8691{
8692 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8693 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8694 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8695 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8696 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8697 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8698 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8699 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8700 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8701 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8702 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8703 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8704 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8705 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8706 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8707 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8708}
8709
8710IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u256_fallback,(PRTUINT256U puDst,
8711 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8712{
8713 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8714 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8715 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8716 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8717 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8718 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8719 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8720 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8721 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8722 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8723 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8724 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8725 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8726 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8727 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8728 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8729 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] + puSrc2->ai8[16]);
8730 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] + puSrc2->ai8[17]);
8731 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] + puSrc2->ai8[18]);
8732 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] + puSrc2->ai8[19]);
8733 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] + puSrc2->ai8[20]);
8734 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] + puSrc2->ai8[21]);
8735 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] + puSrc2->ai8[22]);
8736 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] + puSrc2->ai8[23]);
8737 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] + puSrc2->ai8[24]);
8738 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] + puSrc2->ai8[25]);
8739 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] + puSrc2->ai8[26]);
8740 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] + puSrc2->ai8[27]);
8741 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] + puSrc2->ai8[28]);
8742 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] + puSrc2->ai8[29]);
8743 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] + puSrc2->ai8[30]);
8744 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] + puSrc2->ai8[31]);
8745}
8746
8747
8748/*
8749 * PADDUSB / VPADDUSB
8750 */
/**
 * Narrows an unsigned word value to an unsigned byte with saturation.
 *
 * The argument is truncated to 16 bits for the range check; anything above
 * 0xff (UINT8_MAX) yields 0xff, everything else is passed through as a byte.
 * Note that the argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8755
8756#ifdef IEM_WITHOUT_ASSEMBLY
8757
8758IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8759{
8760 RT_NOREF(pFpuState);
8761 RTUINT64U uSrc1 = { *puDst };
8762 RTUINT64U uSrc2 = { *puSrc };
8763 RTUINT64U uDst;
8764 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8765 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8766 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8767 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8768 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8769 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8770 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8771 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8772 *puDst = uDst.u;
8773}
8774
8775
8776IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8777{
8778 RT_NOREF(pFpuState);
8779 RTUINT128U uSrc1 = *puDst;
8780 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8781 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8782 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8783 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8784 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8785 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8786 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8787 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8788 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8789 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8790 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8791 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8792 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8793 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8794 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8795 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8796}
8797
8798#endif
8799
8800IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8801 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8802{
8803 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8804 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8805 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8806 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8807 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8808 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8809 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8810 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8811 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8812 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8813 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8814 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8815 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8816 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8817 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8818 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8819}
8820
8821IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8822 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8823{
8824 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8825 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8826 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8827 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8828 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8829 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8830 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8831 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8832 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8833 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8834 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8835 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8836 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8837 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8838 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8839 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8840 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8841 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8842 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8843 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8844 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8845 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8846 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8847 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8848 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8849 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8850 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8851 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8852 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8853 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8854 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8855 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8856}
8857
8858
8859/*
8860 * PADDW / VPADDW
8861 */
8862#ifdef IEM_WITHOUT_ASSEMBLY
8863
8864IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8865{
8866 RT_NOREF(pFpuState);
8867 RTUINT64U uSrc1 = { *puDst };
8868 RTUINT64U uSrc2 = { *puSrc };
8869 RTUINT64U uDst;
8870 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8871 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8872 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8873 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8874 *puDst = uDst.u;
8875}
8876
8877
8878IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8879{
8880 RT_NOREF(pFpuState);
8881 RTUINT128U uSrc1 = *puDst;
8882 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8883 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8884 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8885 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8886 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8887 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8888 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8889 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8890}
8891
8892#endif
8893
8894
8895IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8896 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8897{
8898 RT_NOREF(pExtState);
8899 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8900 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8901 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8902 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8903 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8904 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8905 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8906 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8907}
8908
8909IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8910 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8911{
8912 RT_NOREF(pExtState);
8913 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8914 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8915 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8916 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8917 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8918 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8919 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8920 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8921 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8922 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8923 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8924 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8925 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8926 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8927 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8928 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8929}
8930
8931
8932/*
8933 * PADDSW / VPADDSW
8934 */
/**
 * Narrows a signed dword value to a signed word with saturation.
 *
 * Values in the range -0x8000..0x7fff are passed through (as the 16-bit two's
 * complement pattern).  Anything outside yields 0x7fff (INT16_MAX) when bit 31
 * of the input is clear and 0x8000 (INT16_MIN) when it is set.  Note that the
 * argument is evaluated more than once.
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8939
8940#ifdef IEM_WITHOUT_ASSEMBLY
8941
8942IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8943{
8944 RT_NOREF(pFpuState);
8945 RTUINT64U uSrc1 = { *puDst };
8946 RTUINT64U uSrc2 = { *puSrc };
8947 RTUINT64U uDst;
8948 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8949 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8950 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8951 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8952 *puDst = uDst.u;
8953}
8954
8955
8956IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8957{
8958 RT_NOREF(pFpuState);
8959 RTUINT128U uSrc1 = *puDst;
8960 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8961 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8962 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8963 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8964 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8965 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8966 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8967 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8968}
8969
8970#endif
8971
8972IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u128_fallback,(PRTUINT128U puDst,
8973 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8974{
8975 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8976 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8977 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8978 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8979 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8980 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8981 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8982 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8983}
8984
8985IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u256_fallback,(PRTUINT256U puDst,
8986 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8987{
8988 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8989 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8990 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8991 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8992 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8993 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8994 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8995 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8996 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] + puSrc2->ai16[8]);
8997 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] + puSrc2->ai16[9]);
8998 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc2->ai16[10]);
8999 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] + puSrc2->ai16[11]);
9000 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc2->ai16[12]);
9001 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] + puSrc2->ai16[13]);
9002 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc2->ai16[14]);
9003 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] + puSrc2->ai16[15]);
9004}
9005
9006
9007/*
9008 * PADDUSW / VPADDUSW
9009 */
/**
 * Narrows an unsigned dword value to an unsigned word with saturation.
 *
 * The argument is converted to 32 bits for the range check; anything above
 * 0xffff (UINT16_MAX) yields 0xffff, everything else is passed through as a
 * word.  Note that the argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
9014
9015#ifdef IEM_WITHOUT_ASSEMBLY
9016
9017IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9018{
9019 RT_NOREF(pFpuState);
9020 RTUINT64U uSrc1 = { *puDst };
9021 RTUINT64U uSrc2 = { *puSrc };
9022 RTUINT64U uDst;
9023 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
9024 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
9025 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
9026 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
9027 *puDst = uDst.u;
9028}
9029
9030
9031IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9032{
9033 RT_NOREF(pFpuState);
9034 RTUINT128U uSrc1 = *puDst;
9035 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
9036 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
9037 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
9038 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
9039 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
9040 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
9041 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
9042 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
9043}
9044
9045#endif
9046
9047IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
9048 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9049{
9050 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
9051 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
9052 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
9053 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9054 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9055 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9056 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9057 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9058}
9059
9060IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
9061 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9062{
9063 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
9064 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
9065 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
9066 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9067 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9068 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9069 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9070 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9071 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
9072 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
9073 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
9074 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
9075 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
9076 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
9077 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
9078 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
9079}
9080
9081
9082/*
9083 * PADDD / VPADDD.
9084 */
9085#ifdef IEM_WITHOUT_ASSEMBLY
9086
9087IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9088{
9089 RT_NOREF(pFpuState);
9090 RTUINT64U uSrc1 = { *puDst };
9091 RTUINT64U uSrc2 = { *puSrc };
9092 RTUINT64U uDst;
9093 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
9094 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
9095 *puDst = uDst.u;
9096}
9097
9098
9099IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9100{
9101 RT_NOREF(pFpuState);
9102 RTUINT128U uSrc1 = *puDst;
9103 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
9104 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
9105 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
9106 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
9107}
9108
9109#endif /* IEM_WITHOUT_ASSEMBLY */
9110
9111IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9112 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9113{
9114 RT_NOREF(pExtState);
9115 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9116 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9117 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9118 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9119}
9120
9121IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9122 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9123{
9124 RT_NOREF(pExtState);
9125 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9126 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9127 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9128 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9129 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
9130 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
9131 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
9132 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
9133}
9134
9135
9136/*
9137 * PADDQ / VPADDQ.
9138 */
9139#ifdef IEM_WITHOUT_ASSEMBLY
9140
9141IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9142{
9143 RT_NOREF(pFpuState);
9144 *puDst = *puDst + *puSrc;
9145}
9146
9147IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9148{
9149 RT_NOREF(pFpuState);
9150 RTUINT128U uSrc1 = *puDst;
9151 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
9152 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
9153}
9154
9155#endif
9156
9157IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9158 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9159{
9160 RT_NOREF(pExtState);
9161 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9162 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9163}
9164
9165IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9166 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9167{
9168 RT_NOREF(pExtState);
9169 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9170 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9171 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
9172 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
9173}
9174
9175
9176/*
9177 * PSUBB / VPSUBB
9178 */
9179#ifdef IEM_WITHOUT_ASSEMBLY
9180
9181IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9182{
9183 RT_NOREF(pFpuState);
9184 RTUINT64U uSrc1 = { *puDst };
9185 RTUINT64U uSrc2 = { *puSrc };
9186 RTUINT64U uDst;
9187 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
9188 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
9189 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
9190 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
9191 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
9192 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
9193 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
9194 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
9195 *puDst = uDst.u;
9196}
9197
9198
9199IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9200{
9201 RT_NOREF(pFpuState);
9202 RTUINT128U uSrc1 = *puDst;
9203 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
9204 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
9205 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
9206 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
9207 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
9208 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
9209 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
9210 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
9211 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
9212 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
9213 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
9214 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
9215 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
9216 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
9217 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
9218 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
9219}
9220
9221#endif
9222
9223IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9224 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9225{
9226 RT_NOREF(pExtState);
9227 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9228 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9229 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9230 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9231 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9232 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9233 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9234 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9235 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9236 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9237 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9238 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9239 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9240 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9241 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9242 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9243}
9244
9245IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9246 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9247{
9248 RT_NOREF(pExtState);
9249 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9250 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9251 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9252 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9253 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9254 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9255 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9256 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9257 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9258 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9259 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9260 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9261 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9262 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9263 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9264 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9265 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9266 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9267 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9268 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9269 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9270 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9271 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9272 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9273 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9274 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9275 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9276 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9277 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9278 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9279 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9280 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9281}
9282
9283
9284/*
9285 * PSUBSB / VPSUBSB
9286 */
9287#ifdef IEM_WITHOUT_ASSEMBLY
9288
9289IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9290{
9291 RT_NOREF(pFpuState);
9292 RTUINT64U uSrc1 = { *puDst };
9293 RTUINT64U uSrc2 = { *puSrc };
9294 RTUINT64U uDst;
9295 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9296 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9297 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9298 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9299 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9300 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9301 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9302 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9303 *puDst = uDst.u;
9304}
9305
9306
9307IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9308{
9309 RT_NOREF(pFpuState);
9310 RTUINT128U uSrc1 = *puDst;
9311 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9312 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9313 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9314 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9315 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9316 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9317 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9318 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9319 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9320 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9321 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9322 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9323 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9324 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9325 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9326 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9327}
9328
9329#endif
9330
9331IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9332 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9333{
9334 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9335 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9336 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9337 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9338 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9339 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9340 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9341 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9342 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9343 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9344 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9345 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9346 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9347 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9348 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9349 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9350}
9351
9352IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9353 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9354{
9355 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9356 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9357 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9358 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9359 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9360 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9361 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9362 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9363 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9364 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9365 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9366 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9367 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9368 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9369 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9370 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9371 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9372 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9373 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9374 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9375 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9376 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9377 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9378 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9379 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9380 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9381 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9382 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9383 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9384 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9385 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9386 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9387}
9388
9389
9390/*
9391 * PSUBUSB / VPSUBUSB
9392 */
/** @def SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB
 * Saturates the result of an unsigned byte subtraction at zero.
 *
 * The argument is reinterpreted as uint16_t.  Anything above 0xff - which is
 * what a negative difference turns into after that cast - yields 0, all
 * other values yield their low byte.
 *
 * @param   a_uWord     The difference to saturate.  Expanded twice, so it
 *                      must not have side effects.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
9397
9398#ifdef IEM_WITHOUT_ASSEMBLY
9399
9400IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9401{
9402 RT_NOREF(pFpuState);
9403 RTUINT64U uSrc1 = { *puDst };
9404 RTUINT64U uSrc2 = { *puSrc };
9405 RTUINT64U uDst;
9406 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9407 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9408 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9409 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9410 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9411 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9412 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9413 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9414 *puDst = uDst.u;
9415}
9416
9417
9418IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9419{
9420 RT_NOREF(pFpuState);
9421 RTUINT128U uSrc1 = *puDst;
9422 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9423 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9424 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9425 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9426 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9427 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9428 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9429 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9430 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9431 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9432 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9433 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9434 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9435 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9436 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9437 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9438}
9439
9440#endif
9441
9442IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9443 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9444{
9445 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9446 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9447 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9448 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9449 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9450 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9451 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9452 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9453 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9454 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9455 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9456 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9457 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9458 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9459 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9460 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9461}
9462
9463IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9464 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9465{
9466 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9467 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9468 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9469 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9470 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9471 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9472 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9473 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9474 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9475 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9476 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9477 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9478 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9479 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9480 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9481 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9482 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9483 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9484 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9485 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9486 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9487 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9488 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9489 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9490 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9491 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9492 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9493 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9494 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9495 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9496 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9497 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9498}
9499
9500
9501/*
9502 * PSUBW / VPSUBW
9503 */
9504#ifdef IEM_WITHOUT_ASSEMBLY
9505
9506IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9507{
9508 RT_NOREF(pFpuState);
9509 RTUINT64U uSrc1 = { *puDst };
9510 RTUINT64U uSrc2 = { *puSrc };
9511 RTUINT64U uDst;
9512 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9513 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9514 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9515 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9516 *puDst = uDst.u;
9517}
9518
9519
9520IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9521{
9522 RT_NOREF(pFpuState);
9523 RTUINT128U uSrc1 = *puDst;
9524 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9525 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9526 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9527 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9528 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9529 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9530 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9531 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9532}
9533
9534#endif
9535
9536IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9537 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9538{
9539 RT_NOREF(pExtState);
9540 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9541 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9542 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9543 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9544 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9545 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9546 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9547 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9548}
9549
9550IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9551 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9552{
9553 RT_NOREF(pExtState);
9554 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9555 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9556 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9557 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9558 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9559 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9560 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9561 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9562 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9563 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9564 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9565 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9566 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9567 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9568 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9569 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9570}
9571
9572
9573/*
9574 * PSUBSW / VPSUBSW
9575 */
9576#ifdef IEM_WITHOUT_ASSEMBLY
9577
9578IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9579{
9580 RT_NOREF(pFpuState);
9581 RTUINT64U uSrc1 = { *puDst };
9582 RTUINT64U uSrc2 = { *puSrc };
9583 RTUINT64U uDst;
9584 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9585 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9586 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9587 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9588 *puDst = uDst.u;
9589}
9590
9591
9592IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9593{
9594 RT_NOREF(pFpuState);
9595 RTUINT128U uSrc1 = *puDst;
9596 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9597 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9598 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9599 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9600 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9601 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9602 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9603 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9604}
9605
9606#endif
9607
9608IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9609 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9610{
9611 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9612 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9613 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9614 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9615 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9616 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9617 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9618 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9619}
9620
9621IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9622 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9623{
9624 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9625 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9626 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9627 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9628 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9629 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9630 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9631 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9632 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9633 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9634 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9635 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9636 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9637 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9638 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9639 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9640}
9641
9642
9643/*
9644 * PSUBUSW / VPSUBUSW
9645 */
/** @def SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB
 * Saturates the result of an unsigned word subtraction at zero.
 *
 * The argument is reinterpreted as uint32_t.  Anything above 0xffff - which
 * is what a negative difference turns into after that cast - yields 0, all
 * other values yield their low word.
 *
 * @param   a_uDword    The difference to saturate.  Expanded twice, so it
 *                      must not have side effects.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint32_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9650
9651#ifdef IEM_WITHOUT_ASSEMBLY
9652
9653IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9654{
9655 RT_NOREF(pFpuState);
9656 RTUINT64U uSrc1 = { *puDst };
9657 RTUINT64U uSrc2 = { *puSrc };
9658 RTUINT64U uDst;
9659 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9660 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9661 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9662 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9663 *puDst = uDst.u;
9664}
9665
9666
9667IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9668{
9669 RT_NOREF(pFpuState);
9670 RTUINT128U uSrc1 = *puDst;
9671 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9672 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9673 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9674 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9675 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9676 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9677 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9678 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9679}
9680
9681#endif
9682
9683IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9684 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9685{
9686 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9687 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9688 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9689 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9690 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9691 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9692 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9693 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9694}
9695
9696IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9697 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9698{
9699 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9700 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9701 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9702 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9703 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9704 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9705 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9706 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9707 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9708 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9709 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9710 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9711 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9712 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9713 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9714 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9715}
9716
9717
9718
9719/*
9720 * PSUBD / VPSUBD.
9721 */
9722#ifdef IEM_WITHOUT_ASSEMBLY
9723
9724IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9725{
9726 RT_NOREF(pFpuState);
9727 RTUINT64U uSrc1 = { *puDst };
9728 RTUINT64U uSrc2 = { *puSrc };
9729 RTUINT64U uDst;
9730 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9731 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9732 *puDst = uDst.u;
9733}
9734
9735
9736IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9737{
9738 RT_NOREF(pFpuState);
9739 RTUINT128U uSrc1 = *puDst;
9740 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9741 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9742 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9743 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9744}
9745
9746#endif /* IEM_WITHOUT_ASSEMBLY */
9747
9748IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9749 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9750{
9751 RT_NOREF(pExtState);
9752 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9753 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9754 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9755 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9756}
9757
9758IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9759 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9760{
9761 RT_NOREF(pExtState);
9762 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9763 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9764 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9765 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9766 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9767 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9768 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9769 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9770}
9771
9772
9773/*
9774 * PSUBQ / VPSUBQ.
9775 */
9776#ifdef IEM_WITHOUT_ASSEMBLY
9777
9778IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9779{
9780 RT_NOREF(pFpuState);
9781 *puDst = *puDst - *puSrc;
9782}
9783
9784IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9785{
9786 RT_NOREF(pFpuState);
9787 RTUINT128U uSrc1 = *puDst;
9788 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9789 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9790}
9791
9792#endif
9793
9794IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9795 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9796{
9797 RT_NOREF(pExtState);
9798 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9799 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9800}
9801
9802IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9803 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9804{
9805 RT_NOREF(pExtState);
9806 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9807 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9808 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9809 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9810}
9811
9812
9813
9814/*
9815 * PMULLW / VPMULLW / PMULLD / VPMULLD
9816 */
9817#ifdef IEM_WITHOUT_ASSEMBLY
9818
9819IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9820{
9821 RT_NOREF(pFpuState);
9822 RTUINT64U uSrc1 = { *puDst };
9823 RTUINT64U uSrc2 = { *puSrc };
9824 RTUINT64U uDst;
9825 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9826 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9827 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9828 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9829 *puDst = uDst.u;
9830}
9831
9832
9833IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9834{
9835 RT_NOREF(pFpuState);
9836 RTUINT128U uSrc1 = *puDst;
9837 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9838 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9839 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9840 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9841 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9842 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9843 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9844 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9845}
9846
9847#endif
9848
9849IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9850{
9851 RTUINT128U uSrc1 = *puDst;
9852
9853 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9854 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9855 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9856 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9857 RT_NOREF(pFpuState);
9858}
9859
9860
9861IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9862{
9863 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9864 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9865 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9866 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9867 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9868 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9869 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9870 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9871}
9872
9873
9874IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9875{
9876 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9877 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9878 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9879 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9880 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9881 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9882 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9883 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9884 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9885 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9886 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9887 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9888 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9889 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9890 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9891 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9892}
9893
9894
9895IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9896{
9897 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9898 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9899 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9900 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9901}
9902
9903
9904IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9905{
9906 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9907 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9908 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9909 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9910 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9911 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9912 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9913 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9914}
9915
9916
9917/*
9918 * PMULHW / VPMULHW
9919 */
9920#ifdef IEM_WITHOUT_ASSEMBLY
9921
9922IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9923{
9924 RT_NOREF(pFpuState);
9925 RTUINT64U uSrc1 = { *puDst };
9926 RTUINT64U uSrc2 = { *puSrc };
9927 RTUINT64U uDst;
9928 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9929 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9930 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9931 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9932 *puDst = uDst.u;
9933}
9934
9935
9936IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9937{
9938 RT_NOREF(pFpuState);
9939 RTUINT128U uSrc1 = *puDst;
9940 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9941 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9942 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9943 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9944 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9945 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9946 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9947 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9948}
9949
9950#endif
9951
9952IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9953{
9954 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9955 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9956 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9957 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9958 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9959 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9960 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9961 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9962}
9963
9964
9965IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9966{
9967 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9968 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9969 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9970 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9971 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9972 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9973 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9974 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9975 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9976 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9977 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9978 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9979 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9980 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9981 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9982 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9983}
9984
9985
9986/*
9987 * PMULHUW / VPMULHUW
9988 */
9989#ifdef IEM_WITHOUT_ASSEMBLY
9990
9991IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9992{
9993 RTUINT64U uSrc1 = { *puDst };
9994 RTUINT64U uSrc2 = { *puSrc };
9995 RTUINT64U uDst;
9996 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9997 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9998 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9999 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
10000 *puDst = uDst.u;
10001}
10002
10003
10004IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10005{
10006 RTUINT128U uSrc1 = *puDst;
10007 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
10008 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
10009 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
10010 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
10011 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
10012 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
10013 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
10014 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
10015}
10016
10017#endif
10018
10019IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10020{
10021 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
10022 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
10023 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
10024 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
10025 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
10026 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
10027 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
10028 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
10029}
10030
10031
10032IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10033{
10034 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
10035 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
10036 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
10037 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
10038 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
10039 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
10040 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
10041 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
10042 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
10043 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
10044 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
10045 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
10046 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
10047 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
10048 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
10049 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
10050}
10051
10052
10053/*
10054 * PSRLW / VPSRLW
10055 */
10056#ifdef IEM_WITHOUT_ASSEMBLY
10057
10058IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10059{
10060 RTUINT64U uSrc1 = { *puDst };
10061 RTUINT64U uSrc2 = { *puSrc };
10062 RTUINT64U uDst;
10063
10064 if (uSrc2.au64[0] <= 15)
10065 {
10066 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
10067 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
10068 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
10069 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
10070 }
10071 else
10072 {
10073 uDst.au64[0] = 0;
10074 }
10075 *puDst = uDst.u;
10076}
10077
10078
10079IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10080{
10081 RTUINT64U uSrc1 = { *puDst };
10082 RTUINT64U uDst;
10083
10084 if (uShift <= 15)
10085 {
10086 uDst.au16[0] = uSrc1.au16[0] >> uShift;
10087 uDst.au16[1] = uSrc1.au16[1] >> uShift;
10088 uDst.au16[2] = uSrc1.au16[2] >> uShift;
10089 uDst.au16[3] = uSrc1.au16[3] >> uShift;
10090 }
10091 else
10092 {
10093 uDst.au64[0] = 0;
10094 }
10095 *puDst = uDst.u;
10096}
10097
10098
10099IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10100{
10101 RTUINT128U uSrc1 = *puDst;
10102
10103 if (puSrc->au64[0] <= 15)
10104 {
10105 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
10106 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
10107 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
10108 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
10109 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
10110 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
10111 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
10112 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
10113 }
10114 else
10115 {
10116 puDst->au64[0] = 0;
10117 puDst->au64[1] = 0;
10118 }
10119}
10120
10121IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10122{
10123 RTUINT128U uSrc1 = *puDst;
10124
10125 if (uShift <= 15)
10126 {
10127 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10128 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10129 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10130 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10131 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10132 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10133 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10134 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10135 }
10136 else
10137 {
10138 puDst->au64[0] = 0;
10139 puDst->au64[1] = 0;
10140 }
10141}
10142
10143#endif
10144
10145IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10146{
10147 RTUINT128U uSrc1 = *puSrc1;
10148
10149 if (uShift <= 15)
10150 {
10151 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10152 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10153 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10154 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10155 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10156 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10157 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10158 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10159 }
10160 else
10161 {
10162 puDst->au64[0] = 0;
10163 puDst->au64[1] = 0;
10164 }
10165}
10166
10167IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10168{
10169 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10170}
10171
10172IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10173{
10174 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, uShift);
10175}
10176
10177IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10178{
10179 RTUINT256U uSrc1 = *puSrc1;
10180
10181 if (uShift <= 15)
10182 {
10183 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10184 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10185 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10186 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10187 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10188 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10189 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10190 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10191 puDst->au16[8] = uSrc1.au16[8] >> uShift;
10192 puDst->au16[9] = uSrc1.au16[9] >> uShift;
10193 puDst->au16[10] = uSrc1.au16[10] >> uShift;
10194 puDst->au16[11] = uSrc1.au16[11] >> uShift;
10195 puDst->au16[12] = uSrc1.au16[12] >> uShift;
10196 puDst->au16[13] = uSrc1.au16[13] >> uShift;
10197 puDst->au16[14] = uSrc1.au16[14] >> uShift;
10198 puDst->au16[15] = uSrc1.au16[15] >> uShift;
10199 }
10200 else
10201 {
10202 puDst->au64[0] = 0;
10203 puDst->au64[1] = 0;
10204 puDst->au64[2] = 0;
10205 puDst->au64[3] = 0;
10206 }
10207}
10208
10209IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10210{
10211 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, uShift);
10212}
10213
10214IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10215{
10216 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10217}
10218
10219
10220/*
10221 * PSRAW / VPSRAW
10222 */
10223#ifdef IEM_WITHOUT_ASSEMBLY
10224
10225IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10226{
10227 RTUINT64U uSrc1 = { *puDst };
10228 RTUINT64U uSrc2 = { *puSrc };
10229 RTUINT64U uDst;
10230 uint8_t uShift;
10231
10232 uShift = RT_MIN(15, uSrc2.au64[0]);
10233
10234 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10235 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10236 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10237 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10238
10239 *puDst = uDst.u;
10240}
10241
10242
10243IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10244{
10245 RTUINT64U uSrc1 = { *puDst };
10246 RTUINT64U uDst;
10247
10248 uShift = RT_MIN(15, uShift);
10249
10250 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10251 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10252 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10253 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10254
10255 *puDst = uDst.u;
10256}
10257
10258
10259IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10260{
10261 RTUINT128U uSrc1 = *puDst;
10262 uint8_t uShift;
10263
10264 uShift = RT_MIN(15, puSrc->au64[0]);
10265
10266 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10267 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10268 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10269 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10270 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10271 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10272 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10273 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10274}
10275
10276IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10277{
10278 RTUINT128U uSrc1 = *puDst;
10279
10280 uShift = RT_MIN(15, uShift);
10281
10282 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10283 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10284 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10285 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10286 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10287 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10288 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10289 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10290}
10291
10292#endif
10293
10294IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10295{
10296 RTUINT128U uSrc1 = *puSrc1;
10297
10298 uShift = RT_MIN(15, uShift);
10299
10300 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10301 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10302 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10303 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10304 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10305 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10306 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10307 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10308}
10309
10310IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10311{
10312 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10313}
10314
10315IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10316{
10317 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, uShift);
10318}
10319
10320IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10321{
10322 RTUINT256U uSrc1 = *puSrc1;
10323
10324 uShift = RT_MIN(15, uShift);
10325
10326 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10327 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10328 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10329 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10330 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10331 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10332 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10333 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10334 puDst->ai16[8] = uSrc1.ai16[8] >> uShift;
10335 puDst->ai16[9] = uSrc1.ai16[9] >> uShift;
10336 puDst->ai16[10] = uSrc1.ai16[10] >> uShift;
10337 puDst->ai16[11] = uSrc1.ai16[11] >> uShift;
10338 puDst->ai16[12] = uSrc1.ai16[12] >> uShift;
10339 puDst->ai16[13] = uSrc1.ai16[13] >> uShift;
10340 puDst->ai16[14] = uSrc1.ai16[14] >> uShift;
10341 puDst->ai16[15] = uSrc1.ai16[15] >> uShift;
10342}
10343
10344IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10345{
10346 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, uShift);
10347}
10348
10349IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10350{
10351 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10352}
10353
10354
10355/*
10356 * PSLLW / VPSLLW
10357 */
10358#ifdef IEM_WITHOUT_ASSEMBLY
10359
10360IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10361{
10362 RTUINT64U uSrc1 = { *puDst };
10363 RTUINT64U uSrc2 = { *puSrc };
10364 RTUINT64U uDst;
10365
10366 if (uSrc2.au64[0] <= 15)
10367 {
10368 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10369 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10370 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10371 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10372 }
10373 else
10374 {
10375 uDst.au64[0] = 0;
10376 }
10377 *puDst = uDst.u;
10378}
10379
10380
10381IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10382{
10383 RTUINT64U uSrc1 = { *puDst };
10384 RTUINT64U uDst;
10385
10386 if (uShift <= 15)
10387 {
10388 uDst.au16[0] = uSrc1.au16[0] << uShift;
10389 uDst.au16[1] = uSrc1.au16[1] << uShift;
10390 uDst.au16[2] = uSrc1.au16[2] << uShift;
10391 uDst.au16[3] = uSrc1.au16[3] << uShift;
10392 }
10393 else
10394 {
10395 uDst.au64[0] = 0;
10396 }
10397 *puDst = uDst.u;
10398}
10399
10400
10401IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10402{
10403 RTUINT128U uSrc1 = *puDst;
10404
10405 if (puSrc->au64[0] <= 15)
10406 {
10407 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10408 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10409 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10410 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10411 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10412 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10413 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10414 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10415 }
10416 else
10417 {
10418 puDst->au64[0] = 0;
10419 puDst->au64[1] = 0;
10420 }
10421}
10422
10423IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10424{
10425 RTUINT128U uSrc1 = *puDst;
10426
10427 if (uShift <= 15)
10428 {
10429 puDst->au16[0] = uSrc1.au16[0] << uShift;
10430 puDst->au16[1] = uSrc1.au16[1] << uShift;
10431 puDst->au16[2] = uSrc1.au16[2] << uShift;
10432 puDst->au16[3] = uSrc1.au16[3] << uShift;
10433 puDst->au16[4] = uSrc1.au16[4] << uShift;
10434 puDst->au16[5] = uSrc1.au16[5] << uShift;
10435 puDst->au16[6] = uSrc1.au16[6] << uShift;
10436 puDst->au16[7] = uSrc1.au16[7] << uShift;
10437 }
10438 else
10439 {
10440 puDst->au64[0] = 0;
10441 puDst->au64[1] = 0;
10442 }
10443}
10444
10445#endif
10446
10447IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10448{
10449 RTUINT128U uSrc1 = *puSrc1;
10450
10451 if (uShift <= 15)
10452 {
10453 puDst->au16[0] = uSrc1.au16[0] << uShift;
10454 puDst->au16[1] = uSrc1.au16[1] << uShift;
10455 puDst->au16[2] = uSrc1.au16[2] << uShift;
10456 puDst->au16[3] = uSrc1.au16[3] << uShift;
10457 puDst->au16[4] = uSrc1.au16[4] << uShift;
10458 puDst->au16[5] = uSrc1.au16[5] << uShift;
10459 puDst->au16[6] = uSrc1.au16[6] << uShift;
10460 puDst->au16[7] = uSrc1.au16[7] << uShift;
10461 }
10462 else
10463 {
10464 puDst->au64[0] = 0;
10465 puDst->au64[1] = 0;
10466 }
10467}
10468
10469IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10470{
10471 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10472}
10473
10474IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10475{
10476 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, uShift);
10477}
10478
10479IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10480{
10481 RTUINT256U uSrc1 = *puSrc1;
10482
10483 if (uShift <= 15)
10484 {
10485 puDst->au16[0] = uSrc1.au16[0] << uShift;
10486 puDst->au16[1] = uSrc1.au16[1] << uShift;
10487 puDst->au16[2] = uSrc1.au16[2] << uShift;
10488 puDst->au16[3] = uSrc1.au16[3] << uShift;
10489 puDst->au16[4] = uSrc1.au16[4] << uShift;
10490 puDst->au16[5] = uSrc1.au16[5] << uShift;
10491 puDst->au16[6] = uSrc1.au16[6] << uShift;
10492 puDst->au16[7] = uSrc1.au16[7] << uShift;
10493 puDst->au16[8] = uSrc1.au16[8] << uShift;
10494 puDst->au16[9] = uSrc1.au16[9] << uShift;
10495 puDst->au16[10] = uSrc1.au16[10] << uShift;
10496 puDst->au16[11] = uSrc1.au16[11] << uShift;
10497 puDst->au16[12] = uSrc1.au16[12] << uShift;
10498 puDst->au16[13] = uSrc1.au16[13] << uShift;
10499 puDst->au16[14] = uSrc1.au16[14] << uShift;
10500 puDst->au16[15] = uSrc1.au16[15] << uShift;
10501 }
10502 else
10503 {
10504 puDst->au64[0] = 0;
10505 puDst->au64[1] = 0;
10506 puDst->au64[2] = 0;
10507 puDst->au64[3] = 0;
10508 }
10509}
10510
10511IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10512{
10513 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10514}
10515
10516IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10517{
10518 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, uShift);
10519}
10520
10521/*
10522 * PSRLD / VPSRLD
10523 */
10524#ifdef IEM_WITHOUT_ASSEMBLY
10525
10526IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10527{
10528 RTUINT64U uSrc1 = { *puDst };
10529 RTUINT64U uSrc2 = { *puSrc };
10530 RTUINT64U uDst;
10531
10532 if (uSrc2.au64[0] <= 31)
10533 {
10534 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10535 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10536 }
10537 else
10538 {
10539 uDst.au64[0] = 0;
10540 }
10541 *puDst = uDst.u;
10542}
10543
10544
10545IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10546{
10547 RTUINT64U uSrc1 = { *puDst };
10548 RTUINT64U uDst;
10549
10550 if (uShift <= 31)
10551 {
10552 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10553 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10554 }
10555 else
10556 {
10557 uDst.au64[0] = 0;
10558 }
10559 *puDst = uDst.u;
10560}
10561
10562
10563IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10564{
10565 RTUINT128U uSrc1 = *puDst;
10566
10567 if (puSrc->au64[0] <= 31)
10568 {
10569 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10570 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10571 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10572 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10573 }
10574 else
10575 {
10576 puDst->au64[0] = 0;
10577 puDst->au64[1] = 0;
10578 }
10579}
10580
10581IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10582{
10583 RTUINT128U uSrc1 = *puDst;
10584
10585 if (uShift <= 31)
10586 {
10587 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10588 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10589 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10590 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10591 }
10592 else
10593 {
10594 puDst->au64[0] = 0;
10595 puDst->au64[1] = 0;
10596 }
10597}
10598
10599#endif
10600
10601IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10602{
10603 RTUINT128U uSrc1 = *puSrc1;
10604
10605 if (uShift <= 31)
10606 {
10607 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10608 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10609 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10610 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10611 }
10612 else
10613 {
10614 puDst->au64[0] = 0;
10615 puDst->au64[1] = 0;
10616 }
10617}
10618
10619IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10620{
10621 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, uShift);
10622}
10623
10624IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10625{
10626 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10627}
10628
10629IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10630{
10631 RTUINT256U uSrc1 = *puSrc1;
10632
10633 if (uShift <= 31)
10634 {
10635 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10636 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10637 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10638 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10639 puDst->au32[4] = uSrc1.au32[4] >> uShift;
10640 puDst->au32[5] = uSrc1.au32[5] >> uShift;
10641 puDst->au32[6] = uSrc1.au32[6] >> uShift;
10642 puDst->au32[7] = uSrc1.au32[7] >> uShift;
10643 }
10644 else
10645 {
10646 puDst->au64[0] = 0;
10647 puDst->au64[1] = 0;
10648 puDst->au64[2] = 0;
10649 puDst->au64[3] = 0;
10650 }
10651}
10652
10653IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10654{
10655 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10656}
10657
10658IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10659{
10660 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, uShift);
10661}
10662
10663
10664/*
10665 * PSRAD / VPSRAD
10666 */
10667#ifdef IEM_WITHOUT_ASSEMBLY
10668
10669IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10670{
10671 RTUINT64U uSrc1 = { *puDst };
10672 RTUINT64U uSrc2 = { *puSrc };
10673 RTUINT64U uDst;
10674 uint8_t uShift;
10675
10676 uShift = RT_MIN(31, uSrc2.au64[0]);
10677
10678 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10679 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10680
10681 *puDst = uDst.u;
10682}
10683
10684
10685IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10686{
10687 RTUINT64U uSrc1 = { *puDst };
10688 RTUINT64U uDst;
10689
10690 uShift = RT_MIN(31, uShift);
10691
10692 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10693 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10694
10695 *puDst = uDst.u;
10696}
10697
10698
10699IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10700{
10701 RTUINT128U uSrc1 = *puDst;
10702 uint8_t uShift;
10703
10704 uShift = RT_MIN(31, puSrc->au64[0]);
10705
10706 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10707 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10708 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10709 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10710}
10711
10712IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10713{
10714 RTUINT128U uSrc1 = *puDst;
10715
10716 uShift = RT_MIN(31, uShift);
10717
10718 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10719 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10720 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10721 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10722}
10723
10724#endif
10725
10726IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10727{
10728 RTUINT128U uSrc1 = *puSrc1;
10729
10730 uShift = RT_MIN(31, uShift);
10731
10732 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10733 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10734 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10735 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10736}
10737
10738IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10739{
10740 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, uShift);
10741}
10742
10743IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10744{
10745 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10746}
10747
10748IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10749{
10750 RTUINT256U uSrc1 = *puSrc1;
10751
10752 uShift = RT_MIN(31, uShift);
10753
10754 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10755 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10756 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10757 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10758 puDst->ai32[4] = uSrc1.ai32[4] >> uShift;
10759 puDst->ai32[5] = uSrc1.ai32[5] >> uShift;
10760 puDst->ai32[6] = uSrc1.ai32[6] >> uShift;
10761 puDst->ai32[7] = uSrc1.ai32[7] >> uShift;
10762}
10763
10764IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10765{
10766 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10767}
10768
10769IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10770{
10771 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, uShift);
10772}
10773
10774
10775/*
10776 * PSLLD / VPSLLD
10777 */
10778#ifdef IEM_WITHOUT_ASSEMBLY
10779
10780IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10781{
10782 RTUINT64U uSrc1 = { *puDst };
10783 RTUINT64U uSrc2 = { *puSrc };
10784 RTUINT64U uDst;
10785
10786 if (uSrc2.au64[0] <= 31)
10787 {
10788 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10789 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10790 }
10791 else
10792 {
10793 uDst.au64[0] = 0;
10794 }
10795 *puDst = uDst.u;
10796}
10797
10798
10799IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10800{
10801 RTUINT64U uSrc1 = { *puDst };
10802 RTUINT64U uDst;
10803
10804 if (uShift <= 31)
10805 {
10806 uDst.au32[0] = uSrc1.au32[0] << uShift;
10807 uDst.au32[1] = uSrc1.au32[1] << uShift;
10808 }
10809 else
10810 {
10811 uDst.au64[0] = 0;
10812 }
10813 *puDst = uDst.u;
10814}
10815
10816
10817IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10818{
10819 RTUINT128U uSrc1 = *puDst;
10820
10821 if (puSrc->au64[0] <= 31)
10822 {
10823 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10824 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10825 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10826 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10827 }
10828 else
10829 {
10830 puDst->au64[0] = 0;
10831 puDst->au64[1] = 0;
10832 }
10833}
10834
10835IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10836{
10837 RTUINT128U uSrc1 = *puDst;
10838
10839 if (uShift <= 31)
10840 {
10841 puDst->au32[0] = uSrc1.au32[0] << uShift;
10842 puDst->au32[1] = uSrc1.au32[1] << uShift;
10843 puDst->au32[2] = uSrc1.au32[2] << uShift;
10844 puDst->au32[3] = uSrc1.au32[3] << uShift;
10845 }
10846 else
10847 {
10848 puDst->au64[0] = 0;
10849 puDst->au64[1] = 0;
10850 }
10851}
10852
10853#endif
10854
10855IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10856{
10857 RTUINT128U uSrc1 = *puSrc1;
10858
10859 if (uShift <= 31)
10860 {
10861 puDst->au32[0] = uSrc1.au32[0] << uShift;
10862 puDst->au32[1] = uSrc1.au32[1] << uShift;
10863 puDst->au32[2] = uSrc1.au32[2] << uShift;
10864 puDst->au32[3] = uSrc1.au32[3] << uShift;
10865 }
10866 else
10867 {
10868 puDst->au64[0] = 0;
10869 puDst->au64[1] = 0;
10870 }
10871}
10872
10873IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10874{
10875 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, uShift);
10876}
10877
10878IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10879{
10880 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10881}
10882
10883IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10884{
10885 RTUINT256U uSrc1 = *puSrc1;
10886
10887 if (uShift <= 31)
10888 {
10889 puDst->au32[0] = uSrc1.au32[0] << uShift;
10890 puDst->au32[1] = uSrc1.au32[1] << uShift;
10891 puDst->au32[2] = uSrc1.au32[2] << uShift;
10892 puDst->au32[3] = uSrc1.au32[3] << uShift;
10893 puDst->au32[4] = uSrc1.au32[4] << uShift;
10894 puDst->au32[5] = uSrc1.au32[5] << uShift;
10895 puDst->au32[6] = uSrc1.au32[6] << uShift;
10896 puDst->au32[7] = uSrc1.au32[7] << uShift;
10897 }
10898 else
10899 {
10900 puDst->au64[0] = 0;
10901 puDst->au64[1] = 0;
10902 puDst->au64[2] = 0;
10903 puDst->au64[3] = 0;
10904 }
10905}
10906
10907IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10908{
10909 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10910}
10911
10912IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10913{
10914 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, uShift);
10915}
10916
10917
10918/*
10919 * PSRLQ / VPSRLQ
10920 */
10921#ifdef IEM_WITHOUT_ASSEMBLY
10922
10923IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10924{
10925 RTUINT64U uSrc1 = { *puDst };
10926 RTUINT64U uSrc2 = { *puSrc };
10927 RTUINT64U uDst;
10928
10929 if (uSrc2.au64[0] <= 63)
10930 {
10931 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10932 }
10933 else
10934 {
10935 uDst.au64[0] = 0;
10936 }
10937 *puDst = uDst.u;
10938}
10939
10940
10941IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10942{
10943 RTUINT64U uSrc1 = { *puDst };
10944 RTUINT64U uDst;
10945
10946 if (uShift <= 63)
10947 {
10948 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10949 }
10950 else
10951 {
10952 uDst.au64[0] = 0;
10953 }
10954 *puDst = uDst.u;
10955}
10956
10957
10958IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10959{
10960 RTUINT128U uSrc1 = *puDst;
10961
10962 if (puSrc->au64[0] <= 63)
10963 {
10964 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10965 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10966 }
10967 else
10968 {
10969 puDst->au64[0] = 0;
10970 puDst->au64[1] = 0;
10971 }
10972}
10973
10974IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10975{
10976 RTUINT128U uSrc1 = *puDst;
10977
10978 if (uShift <= 63)
10979 {
10980 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10981 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10982 }
10983 else
10984 {
10985 puDst->au64[0] = 0;
10986 puDst->au64[1] = 0;
10987 }
10988}
10989
10990#endif
10991
10992IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10993{
10994 RTUINT128U uSrc1 = *puSrc1;
10995
10996 if (uShift <= 63)
10997 {
10998 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10999 puDst->au64[1] = uSrc1.au64[1] >> uShift;
11000 }
11001 else
11002 {
11003 puDst->au64[0] = 0;
11004 puDst->au64[1] = 0;
11005 }
11006}
11007
11008IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11009{
11010 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, uShift);
11011}
11012
11013IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11014{
11015 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11016}
11017
11018IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11019{
11020 RTUINT256U uSrc1 = *puSrc1;
11021
11022 if (uShift <= 63)
11023 {
11024 puDst->au64[0] = uSrc1.au64[0] >> uShift;
11025 puDst->au64[1] = uSrc1.au64[1] >> uShift;
11026 puDst->au64[2] = uSrc1.au64[2] >> uShift;
11027 puDst->au64[3] = uSrc1.au64[3] >> uShift;
11028 }
11029 else
11030 {
11031 puDst->au64[0] = 0;
11032 puDst->au64[1] = 0;
11033 puDst->au64[2] = 0;
11034 puDst->au64[3] = 0;
11035 }
11036}
11037
11038IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11039{
11040 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11041}
11042
11043IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11044{
11045 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, uShift);
11046}
11047
11048
11049/*
11050 * PSLLQ / VPSLLQ
11051 */
11052#ifdef IEM_WITHOUT_ASSEMBLY
11053
11054IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11055{
11056 RTUINT64U uSrc1 = { *puDst };
11057 RTUINT64U uSrc2 = { *puSrc };
11058 RTUINT64U uDst;
11059
11060 if (uSrc2.au64[0] <= 63)
11061 {
11062 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
11063 }
11064 else
11065 {
11066 uDst.au64[0] = 0;
11067 }
11068 *puDst = uDst.u;
11069}
11070
11071
11072IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
11073{
11074 RTUINT64U uSrc1 = { *puDst };
11075 RTUINT64U uDst;
11076
11077 if (uShift <= 63)
11078 {
11079 uDst.au64[0] = uSrc1.au64[0] << uShift;
11080 }
11081 else
11082 {
11083 uDst.au64[0] = 0;
11084 }
11085 *puDst = uDst.u;
11086}
11087
11088
11089IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11090{
11091 RTUINT128U uSrc1 = *puDst;
11092
11093 if (puSrc->au64[0] <= 63)
11094 {
11095 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
11096 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
11097 }
11098 else
11099 {
11100 puDst->au64[0] = 0;
11101 puDst->au64[1] = 0;
11102 }
11103}
11104
11105IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11106{
11107 RTUINT128U uSrc1 = *puDst;
11108
11109 if (uShift <= 63)
11110 {
11111 puDst->au64[0] = uSrc1.au64[0] << uShift;
11112 puDst->au64[1] = uSrc1.au64[1] << uShift;
11113 }
11114 else
11115 {
11116 puDst->au64[0] = 0;
11117 puDst->au64[1] = 0;
11118 }
11119}
11120
11121#endif
11122
11123IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11124{
11125 RTUINT128U uSrc1 = *puSrc1;
11126
11127 if (uShift <= 63)
11128 {
11129 puDst->au64[0] = uSrc1.au64[0] << uShift;
11130 puDst->au64[1] = uSrc1.au64[1] << uShift;
11131 }
11132 else
11133 {
11134 puDst->au64[0] = 0;
11135 puDst->au64[1] = 0;
11136 }
11137}
11138
11139IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11140{
11141 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11142}
11143
11144IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11145{
11146 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, uShift);
11147}
11148
11149IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11150{
11151 RTUINT256U uSrc1 = *puSrc1;
11152
11153 if (uShift <= 63)
11154 {
11155 puDst->au64[0] = uSrc1.au64[0] << uShift;
11156 puDst->au64[1] = uSrc1.au64[1] << uShift;
11157 puDst->au64[2] = uSrc1.au64[2] << uShift;
11158 puDst->au64[3] = uSrc1.au64[3] << uShift;
11159 }
11160 else
11161 {
11162 puDst->au64[0] = 0;
11163 puDst->au64[1] = 0;
11164 puDst->au64[2] = 0;
11165 puDst->au64[3] = 0;
11166 }
11167}
11168
11169IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11170{
11171 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11172}
11173
11174IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11175{
11176 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, uShift);
11177}
11178
11179
11180/*
11181 * PSRLDQ / VPSRLDQ
11182 */
11183#ifdef IEM_WITHOUT_ASSEMBLY
11184
11185IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11186{
11187 RTUINT128U uSrc1 = *puDst;
11188
11189 if (uShift < 16)
11190 {
11191 int i;
11192
11193 for (i = 0; i < 16 - uShift; ++i)
11194 puDst->au8[i] = uSrc1.au8[i + uShift];
11195 for (i = 16 - uShift; i < 16; ++i)
11196 puDst->au8[i] = 0;
11197 }
11198 else
11199 {
11200 puDst->au64[0] = 0;
11201 puDst->au64[1] = 0;
11202 }
11203}
11204
11205#endif
11206
11207
11208/*
11209 * PSLLDQ / VPSLLDQ
11210 */
11211#ifdef IEM_WITHOUT_ASSEMBLY
11212
11213IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11214{
11215 RTUINT128U uSrc1 = *puDst;
11216
11217 if (uShift < 16)
11218 {
11219 int i;
11220
11221 for (i = 0; i < uShift; ++i)
11222 puDst->au8[i] = 0;
11223 for (i = uShift; i < 16; ++i)
11224 puDst->au8[i] = uSrc1.au8[i - uShift];
11225 }
11226 else
11227 {
11228 puDst->au64[0] = 0;
11229 puDst->au64[1] = 0;
11230 }
11231}
11232
11233#endif
11234
11235
11236/*
11237 * VPSRLVD
11238 */
11239IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11240{
11241 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11242 {
11243 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11244 }
11245}
11246
11247IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11248{
11249 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11250 {
11251 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11252 }
11253}
11254
11255
11256/*
11257 * VPSRAVD
11258 */
11259IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11260{
11261 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11262 {
11263 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11264 }
11265}
11266
11267IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11268{
11269 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11270 {
11271 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11272 }
11273}
11274
11275
11276/*
11277 * VPSLLVD
11278 */
11279IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11280{
11281 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11282 {
11283 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11284 }
11285}
11286
11287IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11288{
11289 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11290 {
11291 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11292 }
11293}
11294
11295
11296/*
11297 * VPSRLVQ
11298 */
11299IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11300{
11301 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11302 {
11303 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11304 }
11305}
11306
11307IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11308{
11309 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11310 {
11311 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11312 }
11313}
11314
11315
11316/*
11317 * VPSLLVQ
11318 */
11319IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11320{
11321 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11322 {
11323 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11324 }
11325}
11326
11327IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11328{
11329 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11330 {
11331 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11332 }
11333}
11334
11335
11336/*
11337 * PMADDWD / VPMADDWD
11338 */
11339#ifdef IEM_WITHOUT_ASSEMBLY
11340
11341IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11342{
11343 RTUINT64U uSrc1 = { *puDst };
11344 RTUINT64U uSrc2 = { *puSrc };
11345 RTUINT64U uDst;
11346
11347 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11348 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11349 *puDst = uDst.u;
11350 RT_NOREF(pFpuState);
11351}
11352
11353
11354IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11355{
11356 RTUINT128U uSrc1 = *puDst;
11357
11358 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11359 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11360 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11361 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11362 RT_NOREF(pFpuState);
11363}
11364
11365#endif
11366
11367
11368IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11369{
11370 RTUINT64U uSrc1 = { *puDst };
11371 RTUINT64U uSrc2 = { *puSrc };
11372 RTUINT64U uDst;
11373
11374 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11375 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11376 *puDst = uDst.u;
11377 RT_NOREF(pFpuState);
11378}
11379
11380
11381IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11382{
11383 RTUINT128U uSrc1 = *puDst;
11384
11385 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11386 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11387 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11388 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11389 RT_NOREF(pFpuState);
11390}
11391
11392
11393IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11394{
11395 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11396 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11397 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11398 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11399}
11400
11401
11402IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11403{
11404 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11405 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11406 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11407 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11408 puDst->ai32[4] = (int32_t)puSrc1->ai16[8] * puSrc2->ai16[8] + (int32_t)puSrc1->ai16[9] * puSrc2->ai16[9];
11409 puDst->ai32[5] = (int32_t)puSrc1->ai16[10] * puSrc2->ai16[10] + (int32_t)puSrc1->ai16[11] * puSrc2->ai16[11];
11410 puDst->ai32[6] = (int32_t)puSrc1->ai16[12] * puSrc2->ai16[12] + (int32_t)puSrc1->ai16[13] * puSrc2->ai16[13];
11411 puDst->ai32[7] = (int32_t)puSrc1->ai16[14] * puSrc2->ai16[14] + (int32_t)puSrc1->ai16[15] * puSrc2->ai16[15];
11412}
11413
11414
11415/*
11416 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
11417 */
11418#ifdef IEM_WITHOUT_ASSEMBLY
11419
11420IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11421{
11422 RTUINT64U uSrc1 = { *puDst };
11423 RTUINT64U uSrc2 = { *puSrc };
11424 RTUINT64U uDst;
11425
11426 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
11427 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
11428 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
11429 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
11430 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
11431 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
11432 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
11433 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
11434 *puDst = uDst.u;
11435 RT_NOREF(pFpuState);
11436}
11437
11438
11439IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11440{
11441 RTUINT128U uSrc1 = *puDst;
11442
11443 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
11444 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
11445 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
11446 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
11447 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
11448 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
11449 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
11450 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
11451 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
11452 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
11453 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
11454 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
11455 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
11456 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
11457 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
11458 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
11459 RT_NOREF(pFpuState);
11460}
11461
11462#endif
11463
11464
11465IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11466{
11467 RTUINT128U uSrc1 = *puDst;
11468
11469 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
11470 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
11471 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
11472 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
11473 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
11474 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
11475 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
11476 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
11477 RT_NOREF(pFpuState);
11478}
11479
11480
11481IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11482{
11483 RTUINT128U uSrc1 = *puDst;
11484
11485 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
11486 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
11487 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
11488 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
11489 RT_NOREF(pFpuState);
11490}
11491
11492
11493IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11494 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11495{
11496 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11497 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11498 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11499 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11500 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11501 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11502 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11503 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11504 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11505 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11506 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11507 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11508 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11509 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11510 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11511 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11512 RT_NOREF(pExtState);
11513}
11514
11515
11516IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11517 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11518{
11519 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11520 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11521 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11522 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11523 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11524 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11525 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11526 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11527 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11528 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11529 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11530 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11531 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11532 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11533 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11534 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11535 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
11536 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
11537 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
11538 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
11539 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
11540 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
11541 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
11542 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
11543 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
11544 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
11545 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
11546 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
11547 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
11548 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
11549 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
11550 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
11551 RT_NOREF(pExtState);
11552}
11553
11554
11555IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11556 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11557{
11558 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11559 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11560 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11561 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11562 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11563 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11564 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11565 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11566 RT_NOREF(pExtState);
11567}
11568
11569
11570IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11571 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11572{
11573 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11574 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11575 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11576 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11577 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11578 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11579 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11580 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11581 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11582 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11583 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
11584 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
11585 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
11586 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
11587 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
11588 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
11589 RT_NOREF(pExtState);
11590}
11591
11592
11593IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11594 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11595{
11596 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11597 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11598 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11599 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11600 RT_NOREF(pExtState);
11601}
11602
11603
11604IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11605 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11606{
11607 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11608 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11609 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11610 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11611 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11612 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11613 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11614 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11615 RT_NOREF(pExtState);
11616}
11617
11618
11619/*
11620 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
11621 */
11622#ifdef IEM_WITHOUT_ASSEMBLY
11623
11624IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11625{
11626 RTUINT64U uSrc1 = { *puDst };
11627 RTUINT64U uSrc2 = { *puSrc };
11628 RTUINT64U uDst;
11629
11630 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
11631 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
11632 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
11633 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
11634 *puDst = uDst.u;
11635 RT_NOREF(pFpuState);
11636}
11637
11638
11639IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11640{
11641 RTUINT128U uSrc1 = *puDst;
11642
11643 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11644 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11645 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11646 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11647 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11648 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11649 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11650 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11651 RT_NOREF(pFpuState);
11652}
11653
11654#endif
11655
11656IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11657{
11658 RTUINT128U uSrc1 = *puDst;
11659
11660 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11661 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11662 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11663 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11664 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11665 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11666 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11667 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11668 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11669 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11670 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
11671 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
11672 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
11673 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
11674 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
11675 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
11676 RT_NOREF(pFpuState);
11677}
11678
11679
11680IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11681{
11682 RTUINT128U uSrc1 = *puDst;
11683
11684 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11685 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11686 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11687 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11688 RT_NOREF(pFpuState);
11689}
11690
11691
11692IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11693 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11694{
11695 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11696 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11697 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11698 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11699 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11700 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11701 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11702 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11703 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11704 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11705 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11706 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11707 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11708 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11709 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11710 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11711 RT_NOREF(pExtState);
11712}
11713
11714
11715IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11716 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11717{
11718 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11719 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11720 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11721 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11722 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11723 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11724 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11725 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11726 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11727 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11728 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11729 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11730 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11731 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11732 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11733 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11734 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
11735 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
11736 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
11737 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
11738 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
11739 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
11740 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
11741 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
11742 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
11743 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
11744 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
11745 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
11746 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
11747 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
11748 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
11749 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
11750 RT_NOREF(pExtState);
11751}
11752
11753
11754IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11755 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11756{
11757 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11758 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11759 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11760 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11761 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11762 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11763 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11764 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11765 RT_NOREF(pExtState);
11766}
11767
11768
11769IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11770 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11771{
11772 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11773 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11774 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11775 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11776 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11777 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11778 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11779 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11780 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11781 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11782 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
11783 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
11784 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
11785 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
11786 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
11787 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
11788 RT_NOREF(pExtState);
11789}
11790
11791
11792IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11793 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11794{
11795 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11796 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11797 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11798 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11799 RT_NOREF(pExtState);
11800}
11801
11802
11803IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11804 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11805{
11806 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11807 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11808 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11809 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11810 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11811 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11812 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11813 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11814 RT_NOREF(pExtState);
11815}
11816
11817
11818/*
11819 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
11820 */
11821#ifdef IEM_WITHOUT_ASSEMBLY
11822
11823IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11824{
11825 RTUINT64U uSrc1 = { *puDst };
11826 RTUINT64U uSrc2 = { *puSrc };
11827 RTUINT64U uDst;
11828
11829 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
11830 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
11831 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
11832 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
11833 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
11834 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
11835 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
11836 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
11837 *puDst = uDst.u;
11838 RT_NOREF(pFpuState);
11839}
11840
11841
11842IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11843{
11844 RTUINT128U uSrc1 = *puDst;
11845
11846 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
11847 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
11848 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
11849 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
11850 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
11851 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11852 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11853 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11854 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11855 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11856 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11857 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11858 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11859 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11860 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11861 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11862 RT_NOREF(pFpuState);
11863}
11864
11865#endif
11866
11867IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11868{
11869 RTUINT128U uSrc1 = *puDst;
11870
11871 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11872 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11873 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11874 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11875 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11876 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11877 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11878 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11879 RT_NOREF(pFpuState);
11880}
11881
11882
11883IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11884{
11885 RTUINT128U uSrc1 = *puDst;
11886
11887 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11888 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11889 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11890 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11891 RT_NOREF(pFpuState);
11892}
11893
11894
11895IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11896 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11897{
11898 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11899 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11900 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11901 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11902 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11903 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11904 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11905 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11906 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11907 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11908 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11909 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11910 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11911 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11912 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11913 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11914 RT_NOREF(pExtState);
11915}
11916
11917
11918IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11919 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11920{
11921 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11922 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11923 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11924 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11925 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11926 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11927 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11928 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11929 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11930 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11931 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11932 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11933 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11934 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11935 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11936 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11937 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11938 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11939 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11940 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11941 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11942 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11943 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11944 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11945 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11946 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11947 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11948 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11949 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11950 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11951 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11952 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11953 RT_NOREF(pExtState);
11954}
11955
11956
11957IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11958 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11959{
11960 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11961 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11962 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11963 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11964 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11965 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11966 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11967 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11968 RT_NOREF(pExtState);
11969}
11970
11971
11972IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11973 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11974{
11975 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11976 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11977 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11978 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11979 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11980 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11981 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11982 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11983 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11984 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11985 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11986 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11987 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11988 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11989 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11990 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11991 RT_NOREF(pExtState);
11992}
11993
11994
11995IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11996 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11997{
11998 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11999 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
12000 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
12001 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
12002 RT_NOREF(pExtState);
12003}
12004
12005
12006IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12007 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12008{
12009 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
12010 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
12011 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
12012 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
12013 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
12014 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
12015 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
12016 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
12017 RT_NOREF(pExtState);
12018}
12019
12020
12021/*
12022 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
12023 */
12024#ifdef IEM_WITHOUT_ASSEMBLY
12025
12026IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12027{
12028 RTUINT64U uSrc1 = { *puDst };
12029 RTUINT64U uSrc2 = { *puSrc };
12030 RTUINT64U uDst;
12031
12032 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
12033 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
12034 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
12035 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
12036 *puDst = uDst.u;
12037 RT_NOREF(pFpuState);
12038}
12039
12040
12041IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12042{
12043 RTUINT128U uSrc1 = *puDst;
12044
12045 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
12046 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
12047 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
12048 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
12049 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
12050 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
12051 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
12052 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
12053 RT_NOREF(pFpuState);
12054}
12055
12056#endif
12057
12058IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12059{
12060 RTUINT128U uSrc1 = *puDst;
12061
12062 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
12063 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
12064 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
12065 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
12066 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
12067 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
12068 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
12069 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
12070 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
12071 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
12072 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
12073 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
12074 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
12075 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
12076 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
12077 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
12078 RT_NOREF(pFpuState);
12079}
12080
12081
12082IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12083{
12084 RTUINT128U uSrc1 = *puDst;
12085
12086 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
12087 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
12088 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
12089 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
12090 RT_NOREF(pFpuState);
12091}
12092
12093
12094IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12095 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12096{
12097 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
12098 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
12099 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
12100 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
12101 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
12102 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
12103 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
12104 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
12105 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
12106 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
12107 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
12108 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
12109 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
12110 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
12111 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
12112 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
12113 RT_NOREF(pExtState);
12114}
12115
12116
12117IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12118 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12119{
12120 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
12121 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
12122 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
12123 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
12124 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
12125 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
12126 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
12127 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
12128 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
12129 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
12130 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
12131 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
12132 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
12133 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
12134 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
12135 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
12136 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
12137 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
12138 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
12139 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
12140 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
12141 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
12142 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
12143 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
12144 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
12145 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
12146 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
12147 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
12148 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
12149 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
12150 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
12151 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
12152 RT_NOREF(pExtState);
12153}
12154
12155
12156IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12157 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12158{
12159 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12160 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12161 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12162 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12163 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12164 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12165 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12166 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12167 RT_NOREF(pExtState);
12168}
12169
12170
12171IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12172 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12173{
12174 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12175 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12176 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12177 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12178 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12179 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12180 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12181 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12182 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
12183 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
12184 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
12185 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
12186 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
12187 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
12188 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
12189 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
12190 RT_NOREF(pExtState);
12191}
12192
12193
12194IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12195 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12196{
12197 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12198 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12199 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12200 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12201 RT_NOREF(pExtState);
12202}
12203
12204
12205IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12206 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12207{
12208 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12209 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12210 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12211 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12212 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
12213 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
12214 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
12215 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
12216 RT_NOREF(pExtState);
12217}
12218
12219
12220/*
12221 * PAVGB / VPAVGB / PAVGW / VPAVGW
12222 */
/** Rounded-up average of two unsigned bytes: (left + right + 1) >> 1.  The
 *  left operand is widened before the addition and the result is narrowed
 *  back to uint8_t. */
#define PAVGB_EXEC(a_uLeft, a_uRight)   ( (uint8_t)( ((uint16_t)(a_uLeft) + (a_uRight) + 1) >> 1 ) )
/** Rounded-up average of two unsigned words: (left + right + 1) >> 1.  The
 *  left operand is widened to 32 bits before the addition and the result is
 *  narrowed back to uint16_t. */
#define PAVGW_EXEC(a_uLeft, a_uRight)   ( (uint16_t)( ((uint32_t)(a_uLeft) + (a_uRight) + 1) >> 1 ) )
12225
12226#ifdef IEM_WITHOUT_ASSEMBLY
12227
12228IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12229{
12230 RTUINT64U uSrc1 = { *puDst };
12231 RTUINT64U uSrc2 = { *puSrc };
12232 RTUINT64U uDst;
12233
12234 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
12235 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
12236 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
12237 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
12238 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
12239 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
12240 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
12241 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
12242 *puDst = uDst.u;
12243}
12244
12245
12246IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12247{
12248 RTUINT128U uSrc1 = *puDst;
12249
12250 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12251 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12252 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12253 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12254 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12255 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12256 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12257 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12258 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12259 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12260 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12261 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12262 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12263 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12264 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12265 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12266}
12267
12268
12269IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12270{
12271 RTUINT64U uSrc1 = { *puDst };
12272 RTUINT64U uSrc2 = { *puSrc };
12273 RTUINT64U uDst;
12274
12275 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
12276 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
12277 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
12278 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
12279 *puDst = uDst.u;
12280}
12281
12282
12283IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12284{
12285 RTUINT128U uSrc1 = *puDst;
12286
12287 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
12288 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
12289 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
12290 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
12291 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
12292 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
12293 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
12294 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
12295}
12296
12297#endif
12298
12299IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12300{
12301 RTUINT128U uSrc1 = *puDst;
12302
12303 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12304 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12305 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12306 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12307 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12308 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12309 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12310 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12311 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12312 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12313 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12314 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12315 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12316 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12317 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12318 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12319}
12320
12321
12322IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12323{
12324 RTUINT128U uSrc1 = *puDst;
12325
12326 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12327 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12328 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12329 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12330 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12331 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12332 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12333 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12334 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12335 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12336 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12337 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12338 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12339 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12340 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12341 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12342}
12343
12344
12345IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12346{
12347 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12348 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12349 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12350 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12351 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12352 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12353 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12354 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12355 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12356 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12357 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12358 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12359 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12360 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12361 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12362 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12363}
12364
12365
12366IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12367{
12368 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12369 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12370 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12371 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12372 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12373 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12374 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12375 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12376 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12377 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12378 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12379 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12380 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12381 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12382 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12383 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12384 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
12385 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
12386 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
12387 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
12388 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
12389 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
12390 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
12391 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
12392 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
12393 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
12394 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
12395 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
12396 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
12397 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
12398 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
12399 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
12400}
12401
12402
12403IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12404{
12405 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12406 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12407 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12408 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12409 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12410 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12411 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12412 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12413}
12414
12415
12416IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12417{
12418 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12419 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12420 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12421 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12422 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12423 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12424 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12425 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12426 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
12427 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
12428 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
12429 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
12430 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
12431 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
12432 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
12433 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
12434}
12435
12436#undef PAVGB_EXEC
12437#undef PAVGW_EXEC
12438
12439
12440/*
12441 * PMOVMSKB / VPMOVMSKB
12442 */
12443#ifdef IEM_WITHOUT_ASSEMBLY
12444
12445IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
12446{
12447 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12448 uint64_t const uSrc = *pu64Src;
12449 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
12450 | ((uSrc >> (15-1)) & RT_BIT_64(1))
12451 | ((uSrc >> (23-2)) & RT_BIT_64(2))
12452 | ((uSrc >> (31-3)) & RT_BIT_64(3))
12453 | ((uSrc >> (39-4)) & RT_BIT_64(4))
12454 | ((uSrc >> (47-5)) & RT_BIT_64(5))
12455 | ((uSrc >> (55-6)) & RT_BIT_64(6))
12456 | ((uSrc >> (63-7)) & RT_BIT_64(7));
12457}
12458
12459
12460IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
12461{
12462 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12463 uint64_t const uSrc0 = pu128Src->QWords.qw0;
12464 uint64_t const uSrc1 = pu128Src->QWords.qw1;
12465 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12466 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12467 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12468 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12469 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12470 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12471 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12472 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12473 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12474 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12475 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12476 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12477 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12478 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12479 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12480 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
12481}
12482
12483#endif
12484
12485IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
12486{
12487 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12488 uint64_t const uSrc0 = puSrc->QWords.qw0;
12489 uint64_t const uSrc1 = puSrc->QWords.qw1;
12490 uint64_t const uSrc2 = puSrc->QWords.qw2;
12491 uint64_t const uSrc3 = puSrc->QWords.qw3;
12492 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12493 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12494 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12495 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12496 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12497 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12498 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12499 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12500 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12501 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12502 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12503 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12504 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12505 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12506 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12507 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
12508 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
12509 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
12510 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
12511 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
12512 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
12513 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
12514 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
12515 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
12516 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
12517 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
12518 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
12519 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
12520 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
12521 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
12522 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
12523 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
12524}
12525
12526
12527/*
12528 * [V]PSHUFB
12529 */
12530
12531IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12532{
12533 RTUINT64U const uSrc = { *puSrc };
12534 RTUINT64U const uDstIn = { *puDst };
12535 ASMCompilerBarrier();
12536 RTUINT64U uDstOut = { 0 };
12537 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
12538 {
12539 uint8_t idxSrc = uSrc.au8[iByte];
12540 if (!(idxSrc & 0x80))
12541 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
12542 }
12543 *puDst = uDstOut.u;
12544 RT_NOREF(pFpuState);
12545}
12546
12547
12548IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12549{
12550 RTUINT128U const uSrc = *puSrc;
12551 RTUINT128U const uDstIn = *puDst;
12552 ASMCompilerBarrier();
12553 puDst->au64[0] = 0;
12554 puDst->au64[1] = 0;
12555 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12556 {
12557 uint8_t idxSrc = uSrc.au8[iByte];
12558 if (!(idxSrc & 0x80))
12559 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
12560 }
12561 RT_NOREF(pFpuState);
12562}
12563
12564
12565IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12566 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12567{
12568 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
12569 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
12570 ASMCompilerBarrier();
12571 puDst->au64[0] = 0;
12572 puDst->au64[1] = 0;
12573 for (unsigned iByte = 0; iByte < 16; iByte++)
12574 {
12575 uint8_t idxSrc = uSrc2.au8[iByte];
12576 if (!(idxSrc & 0x80))
12577 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12578 }
12579 RT_NOREF(pExtState);
12580}
12581
12582
12583IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12584 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12585{
12586 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
12587 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
12588 ASMCompilerBarrier();
12589 puDst->au64[0] = 0;
12590 puDst->au64[1] = 0;
12591 puDst->au64[2] = 0;
12592 puDst->au64[3] = 0;
12593 for (unsigned iByte = 0; iByte < 16; iByte++)
12594 {
12595 uint8_t idxSrc = uSrc2.au8[iByte];
12596 if (!(idxSrc & 0x80))
12597 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12598 }
12599 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12600 {
12601 uint8_t idxSrc = uSrc2.au8[iByte];
12602 if (!(idxSrc & 0x80))
12603 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
12604 }
12605 RT_NOREF(pExtState);
12606}
12607
12608
12609/*
12610 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
12611 */
12612#ifdef IEM_WITHOUT_ASSEMBLY
12613
12614IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
12615{
12616 uint64_t const uSrc = *puSrc;
12617 ASMCompilerBarrier();
12618 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12619 uSrc >> (((bEvil >> 2) & 3) * 16),
12620 uSrc >> (((bEvil >> 4) & 3) * 16),
12621 uSrc >> (((bEvil >> 6) & 3) * 16));
12622}
12623
12624
12625IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12626{
12627 puDst->QWords.qw0 = puSrc->QWords.qw0;
12628 uint64_t const uSrc = puSrc->QWords.qw1;
12629 ASMCompilerBarrier();
12630 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12631 uSrc >> (((bEvil >> 2) & 3) * 16),
12632 uSrc >> (((bEvil >> 4) & 3) * 16),
12633 uSrc >> (((bEvil >> 6) & 3) * 16));
12634}
12635
12636#endif
12637
12638IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12639{
12640 puDst->QWords.qw0 = puSrc->QWords.qw0;
12641 uint64_t const uSrc1 = puSrc->QWords.qw1;
12642 puDst->QWords.qw2 = puSrc->QWords.qw2;
12643 uint64_t const uSrc3 = puSrc->QWords.qw3;
12644 ASMCompilerBarrier();
12645 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
12646 uSrc1 >> (((bEvil >> 2) & 3) * 16),
12647 uSrc1 >> (((bEvil >> 4) & 3) * 16),
12648 uSrc1 >> (((bEvil >> 6) & 3) * 16));
12649 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
12650 uSrc3 >> (((bEvil >> 2) & 3) * 16),
12651 uSrc3 >> (((bEvil >> 4) & 3) * 16),
12652 uSrc3 >> (((bEvil >> 6) & 3) * 16));
12653}
12654
12655#ifdef IEM_WITHOUT_ASSEMBLY
12656IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12657{
12658 puDst->QWords.qw1 = puSrc->QWords.qw1;
12659 uint64_t const uSrc = puSrc->QWords.qw0;
12660 ASMCompilerBarrier();
12661 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12662 uSrc >> (((bEvil >> 2) & 3) * 16),
12663 uSrc >> (((bEvil >> 4) & 3) * 16),
12664 uSrc >> (((bEvil >> 6) & 3) * 16));
12665
12666}
12667#endif
12668
12669
12670IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12671{
12672 puDst->QWords.qw3 = puSrc->QWords.qw3;
12673 uint64_t const uSrc2 = puSrc->QWords.qw2;
12674 puDst->QWords.qw1 = puSrc->QWords.qw1;
12675 uint64_t const uSrc0 = puSrc->QWords.qw0;
12676 ASMCompilerBarrier();
12677 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
12678 uSrc0 >> (((bEvil >> 2) & 3) * 16),
12679 uSrc0 >> (((bEvil >> 4) & 3) * 16),
12680 uSrc0 >> (((bEvil >> 6) & 3) * 16));
12681 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
12682 uSrc2 >> (((bEvil >> 2) & 3) * 16),
12683 uSrc2 >> (((bEvil >> 4) & 3) * 16),
12684 uSrc2 >> (((bEvil >> 6) & 3) * 16));
12685
12686}
12687
12688
12689#ifdef IEM_WITHOUT_ASSEMBLY
12690IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12691{
12692 RTUINT128U const uSrc = *puSrc;
12693 ASMCompilerBarrier();
12694 puDst->au32[0] = uSrc.au32[bEvil & 3];
12695 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
12696 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
12697 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
12698}
12699#endif
12700
12701
12702IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12703{
12704 RTUINT256U const uSrc = *puSrc;
12705 ASMCompilerBarrier();
12706 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
12707 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
12708 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
12709 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
12710 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
12711 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
12712 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
12713 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
12714}
12715
12716
12717/*
12718 * PUNPCKHBW - high bytes -> words
12719 */
12720#ifdef IEM_WITHOUT_ASSEMBLY
12721
12722IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12723{
12724 RTUINT64U const uSrc2 = { *puSrc };
12725 RTUINT64U const uSrc1 = { *puDst };
12726 ASMCompilerBarrier();
12727 RTUINT64U uDstOut;
12728 uDstOut.au8[0] = uSrc1.au8[4];
12729 uDstOut.au8[1] = uSrc2.au8[4];
12730 uDstOut.au8[2] = uSrc1.au8[5];
12731 uDstOut.au8[3] = uSrc2.au8[5];
12732 uDstOut.au8[4] = uSrc1.au8[6];
12733 uDstOut.au8[5] = uSrc2.au8[6];
12734 uDstOut.au8[6] = uSrc1.au8[7];
12735 uDstOut.au8[7] = uSrc2.au8[7];
12736 *puDst = uDstOut.u;
12737}
12738
12739
12740IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12741{
12742 RTUINT128U const uSrc2 = *puSrc;
12743 RTUINT128U const uSrc1 = *puDst;
12744 ASMCompilerBarrier();
12745 RTUINT128U uDstOut;
12746 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12747 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12748 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12749 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12750 uDstOut.au8[ 4] = uSrc1.au8[10];
12751 uDstOut.au8[ 5] = uSrc2.au8[10];
12752 uDstOut.au8[ 6] = uSrc1.au8[11];
12753 uDstOut.au8[ 7] = uSrc2.au8[11];
12754 uDstOut.au8[ 8] = uSrc1.au8[12];
12755 uDstOut.au8[ 9] = uSrc2.au8[12];
12756 uDstOut.au8[10] = uSrc1.au8[13];
12757 uDstOut.au8[11] = uSrc2.au8[13];
12758 uDstOut.au8[12] = uSrc1.au8[14];
12759 uDstOut.au8[13] = uSrc2.au8[14];
12760 uDstOut.au8[14] = uSrc1.au8[15];
12761 uDstOut.au8[15] = uSrc2.au8[15];
12762 *puDst = uDstOut;
12763}
12764
12765#endif
12766
12767IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12768{
12769 RTUINT128U const uSrc2 = *puSrc2;
12770 RTUINT128U const uSrc1 = *puSrc1;
12771 ASMCompilerBarrier();
12772 RTUINT128U uDstOut;
12773 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12774 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12775 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12776 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12777 uDstOut.au8[ 4] = uSrc1.au8[10];
12778 uDstOut.au8[ 5] = uSrc2.au8[10];
12779 uDstOut.au8[ 6] = uSrc1.au8[11];
12780 uDstOut.au8[ 7] = uSrc2.au8[11];
12781 uDstOut.au8[ 8] = uSrc1.au8[12];
12782 uDstOut.au8[ 9] = uSrc2.au8[12];
12783 uDstOut.au8[10] = uSrc1.au8[13];
12784 uDstOut.au8[11] = uSrc2.au8[13];
12785 uDstOut.au8[12] = uSrc1.au8[14];
12786 uDstOut.au8[13] = uSrc2.au8[14];
12787 uDstOut.au8[14] = uSrc1.au8[15];
12788 uDstOut.au8[15] = uSrc2.au8[15];
12789 *puDst = uDstOut;
12790}
12791
12792
12793IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12794{
12795 RTUINT256U const uSrc2 = *puSrc2;
12796 RTUINT256U const uSrc1 = *puSrc1;
12797 ASMCompilerBarrier();
12798 RTUINT256U uDstOut;
12799 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12800 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12801 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12802 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12803 uDstOut.au8[ 4] = uSrc1.au8[10];
12804 uDstOut.au8[ 5] = uSrc2.au8[10];
12805 uDstOut.au8[ 6] = uSrc1.au8[11];
12806 uDstOut.au8[ 7] = uSrc2.au8[11];
12807 uDstOut.au8[ 8] = uSrc1.au8[12];
12808 uDstOut.au8[ 9] = uSrc2.au8[12];
12809 uDstOut.au8[10] = uSrc1.au8[13];
12810 uDstOut.au8[11] = uSrc2.au8[13];
12811 uDstOut.au8[12] = uSrc1.au8[14];
12812 uDstOut.au8[13] = uSrc2.au8[14];
12813 uDstOut.au8[14] = uSrc1.au8[15];
12814 uDstOut.au8[15] = uSrc2.au8[15];
12815 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12816 uDstOut.au8[16] = uSrc1.au8[24];
12817 uDstOut.au8[17] = uSrc2.au8[24];
12818 uDstOut.au8[18] = uSrc1.au8[25];
12819 uDstOut.au8[19] = uSrc2.au8[25];
12820 uDstOut.au8[20] = uSrc1.au8[26];
12821 uDstOut.au8[21] = uSrc2.au8[26];
12822 uDstOut.au8[22] = uSrc1.au8[27];
12823 uDstOut.au8[23] = uSrc2.au8[27];
12824 uDstOut.au8[24] = uSrc1.au8[28];
12825 uDstOut.au8[25] = uSrc2.au8[28];
12826 uDstOut.au8[26] = uSrc1.au8[29];
12827 uDstOut.au8[27] = uSrc2.au8[29];
12828 uDstOut.au8[28] = uSrc1.au8[30];
12829 uDstOut.au8[29] = uSrc2.au8[30];
12830 uDstOut.au8[30] = uSrc1.au8[31];
12831 uDstOut.au8[31] = uSrc2.au8[31];
12832 *puDst = uDstOut;
12833}
12834
12835
12836/*
 * PUNPCKHWD - high words -> dwords
12838 */
12839#ifdef IEM_WITHOUT_ASSEMBLY
12840
12841IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12842{
12843 RTUINT64U const uSrc2 = { *puSrc };
12844 RTUINT64U const uSrc1 = { *puDst };
12845 ASMCompilerBarrier();
12846 RTUINT64U uDstOut;
12847 uDstOut.au16[0] = uSrc1.au16[2];
12848 uDstOut.au16[1] = uSrc2.au16[2];
12849 uDstOut.au16[2] = uSrc1.au16[3];
12850 uDstOut.au16[3] = uSrc2.au16[3];
12851 *puDst = uDstOut.u;
12852}
12853
12854
12855IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12856{
12857 RTUINT128U const uSrc2 = *puSrc;
12858 RTUINT128U const uSrc1 = *puDst;
12859 ASMCompilerBarrier();
12860 RTUINT128U uDstOut;
12861 uDstOut.au16[0] = uSrc1.au16[4];
12862 uDstOut.au16[1] = uSrc2.au16[4];
12863 uDstOut.au16[2] = uSrc1.au16[5];
12864 uDstOut.au16[3] = uSrc2.au16[5];
12865 uDstOut.au16[4] = uSrc1.au16[6];
12866 uDstOut.au16[5] = uSrc2.au16[6];
12867 uDstOut.au16[6] = uSrc1.au16[7];
12868 uDstOut.au16[7] = uSrc2.au16[7];
12869 *puDst = uDstOut;
12870}
12871
12872#endif
12873
12874IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12875{
12876 RTUINT128U const uSrc2 = *puSrc2;
12877 RTUINT128U const uSrc1 = *puSrc1;
12878 ASMCompilerBarrier();
12879 RTUINT128U uDstOut;
12880 uDstOut.au16[0] = uSrc1.au16[4];
12881 uDstOut.au16[1] = uSrc2.au16[4];
12882 uDstOut.au16[2] = uSrc1.au16[5];
12883 uDstOut.au16[3] = uSrc2.au16[5];
12884 uDstOut.au16[4] = uSrc1.au16[6];
12885 uDstOut.au16[5] = uSrc2.au16[6];
12886 uDstOut.au16[6] = uSrc1.au16[7];
12887 uDstOut.au16[7] = uSrc2.au16[7];
12888 *puDst = uDstOut;
12889}
12890
12891
12892IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12893{
12894 RTUINT256U const uSrc2 = *puSrc2;
12895 RTUINT256U const uSrc1 = *puSrc1;
12896 ASMCompilerBarrier();
12897 RTUINT256U uDstOut;
12898 uDstOut.au16[0] = uSrc1.au16[4];
12899 uDstOut.au16[1] = uSrc2.au16[4];
12900 uDstOut.au16[2] = uSrc1.au16[5];
12901 uDstOut.au16[3] = uSrc2.au16[5];
12902 uDstOut.au16[4] = uSrc1.au16[6];
12903 uDstOut.au16[5] = uSrc2.au16[6];
12904 uDstOut.au16[6] = uSrc1.au16[7];
12905 uDstOut.au16[7] = uSrc2.au16[7];
12906
12907 uDstOut.au16[8] = uSrc1.au16[12];
12908 uDstOut.au16[9] = uSrc2.au16[12];
12909 uDstOut.au16[10] = uSrc1.au16[13];
12910 uDstOut.au16[11] = uSrc2.au16[13];
12911 uDstOut.au16[12] = uSrc1.au16[14];
12912 uDstOut.au16[13] = uSrc2.au16[14];
12913 uDstOut.au16[14] = uSrc1.au16[15];
12914 uDstOut.au16[15] = uSrc2.au16[15];
12915 *puDst = uDstOut;
12916}
12917
12918
12919/*
 * PUNPCKHDQ - high dwords -> qword(s)
12921 */
12922#ifdef IEM_WITHOUT_ASSEMBLY
12923
12924IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12925{
12926 RTUINT64U const uSrc2 = { *puSrc };
12927 RTUINT64U const uSrc1 = { *puDst };
12928 ASMCompilerBarrier();
12929 RTUINT64U uDstOut;
12930 uDstOut.au32[0] = uSrc1.au32[1];
12931 uDstOut.au32[1] = uSrc2.au32[1];
12932 *puDst = uDstOut.u;
12933}
12934
12935
12936IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12937{
12938 RTUINT128U const uSrc2 = *puSrc;
12939 RTUINT128U const uSrc1 = *puDst;
12940 ASMCompilerBarrier();
12941 RTUINT128U uDstOut;
12942 uDstOut.au32[0] = uSrc1.au32[2];
12943 uDstOut.au32[1] = uSrc2.au32[2];
12944 uDstOut.au32[2] = uSrc1.au32[3];
12945 uDstOut.au32[3] = uSrc2.au32[3];
12946 *puDst = uDstOut;
12947}
12948
12949#endif
12950
12951IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12952{
12953 RTUINT128U const uSrc2 = *puSrc2;
12954 RTUINT128U const uSrc1 = *puSrc1;
12955 ASMCompilerBarrier();
12956 RTUINT128U uDstOut;
12957 uDstOut.au32[0] = uSrc1.au32[2];
12958 uDstOut.au32[1] = uSrc2.au32[2];
12959 uDstOut.au32[2] = uSrc1.au32[3];
12960 uDstOut.au32[3] = uSrc2.au32[3];
12961 *puDst = uDstOut;
12962}
12963
12964
12965IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12966{
12967 RTUINT256U const uSrc2 = *puSrc2;
12968 RTUINT256U const uSrc1 = *puSrc1;
12969 ASMCompilerBarrier();
12970 RTUINT256U uDstOut;
12971 uDstOut.au32[0] = uSrc1.au32[2];
12972 uDstOut.au32[1] = uSrc2.au32[2];
12973 uDstOut.au32[2] = uSrc1.au32[3];
12974 uDstOut.au32[3] = uSrc2.au32[3];
12975
12976 uDstOut.au32[4] = uSrc1.au32[6];
12977 uDstOut.au32[5] = uSrc2.au32[6];
12978 uDstOut.au32[6] = uSrc1.au32[7];
12979 uDstOut.au32[7] = uSrc2.au32[7];
12980 *puDst = uDstOut;
12981}
12982
12983
12984/*
12985 * PUNPCKHQDQ -> High qwords -> double qword(s).
12986 */
12987#ifdef IEM_WITHOUT_ASSEMBLY
12988IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12989{
12990 RTUINT128U const uSrc2 = *puSrc;
12991 RTUINT128U const uSrc1 = *puDst;
12992 ASMCompilerBarrier();
12993 RTUINT128U uDstOut;
12994 uDstOut.au64[0] = uSrc1.au64[1];
12995 uDstOut.au64[1] = uSrc2.au64[1];
12996 *puDst = uDstOut;
12997}
12998#endif
12999
13000
13001IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13002{
13003 RTUINT128U const uSrc2 = *puSrc2;
13004 RTUINT128U const uSrc1 = *puSrc1;
13005 ASMCompilerBarrier();
13006 RTUINT128U uDstOut;
13007 uDstOut.au64[0] = uSrc1.au64[1];
13008 uDstOut.au64[1] = uSrc2.au64[1];
13009 *puDst = uDstOut;
13010}
13011
13012
13013IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13014{
13015 RTUINT256U const uSrc2 = *puSrc2;
13016 RTUINT256U const uSrc1 = *puSrc1;
13017 ASMCompilerBarrier();
13018 RTUINT256U uDstOut;
13019 uDstOut.au64[0] = uSrc1.au64[1];
13020 uDstOut.au64[1] = uSrc2.au64[1];
13021
13022 uDstOut.au64[2] = uSrc1.au64[3];
13023 uDstOut.au64[3] = uSrc2.au64[3];
13024 *puDst = uDstOut;
13025}
13026
13027
13028/*
13029 * PUNPCKLBW - low bytes -> words
13030 */
13031#ifdef IEM_WITHOUT_ASSEMBLY
13032
13033IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13034{
13035 RTUINT64U const uSrc2 = { *puSrc };
13036 RTUINT64U const uSrc1 = { *puDst };
13037 ASMCompilerBarrier();
13038 RTUINT64U uDstOut;
13039 uDstOut.au8[0] = uSrc1.au8[0];
13040 uDstOut.au8[1] = uSrc2.au8[0];
13041 uDstOut.au8[2] = uSrc1.au8[1];
13042 uDstOut.au8[3] = uSrc2.au8[1];
13043 uDstOut.au8[4] = uSrc1.au8[2];
13044 uDstOut.au8[5] = uSrc2.au8[2];
13045 uDstOut.au8[6] = uSrc1.au8[3];
13046 uDstOut.au8[7] = uSrc2.au8[3];
13047 *puDst = uDstOut.u;
13048}
13049
13050
13051IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13052{
13053 RTUINT128U const uSrc2 = *puSrc;
13054 RTUINT128U const uSrc1 = *puDst;
13055 ASMCompilerBarrier();
13056 RTUINT128U uDstOut;
13057 uDstOut.au8[ 0] = uSrc1.au8[0];
13058 uDstOut.au8[ 1] = uSrc2.au8[0];
13059 uDstOut.au8[ 2] = uSrc1.au8[1];
13060 uDstOut.au8[ 3] = uSrc2.au8[1];
13061 uDstOut.au8[ 4] = uSrc1.au8[2];
13062 uDstOut.au8[ 5] = uSrc2.au8[2];
13063 uDstOut.au8[ 6] = uSrc1.au8[3];
13064 uDstOut.au8[ 7] = uSrc2.au8[3];
13065 uDstOut.au8[ 8] = uSrc1.au8[4];
13066 uDstOut.au8[ 9] = uSrc2.au8[4];
13067 uDstOut.au8[10] = uSrc1.au8[5];
13068 uDstOut.au8[11] = uSrc2.au8[5];
13069 uDstOut.au8[12] = uSrc1.au8[6];
13070 uDstOut.au8[13] = uSrc2.au8[6];
13071 uDstOut.au8[14] = uSrc1.au8[7];
13072 uDstOut.au8[15] = uSrc2.au8[7];
13073 *puDst = uDstOut;
13074}
13075
13076#endif
13077
13078IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13079{
13080 RTUINT128U const uSrc2 = *puSrc2;
13081 RTUINT128U const uSrc1 = *puSrc1;
13082 ASMCompilerBarrier();
13083 RTUINT128U uDstOut;
13084 uDstOut.au8[ 0] = uSrc1.au8[0];
13085 uDstOut.au8[ 1] = uSrc2.au8[0];
13086 uDstOut.au8[ 2] = uSrc1.au8[1];
13087 uDstOut.au8[ 3] = uSrc2.au8[1];
13088 uDstOut.au8[ 4] = uSrc1.au8[2];
13089 uDstOut.au8[ 5] = uSrc2.au8[2];
13090 uDstOut.au8[ 6] = uSrc1.au8[3];
13091 uDstOut.au8[ 7] = uSrc2.au8[3];
13092 uDstOut.au8[ 8] = uSrc1.au8[4];
13093 uDstOut.au8[ 9] = uSrc2.au8[4];
13094 uDstOut.au8[10] = uSrc1.au8[5];
13095 uDstOut.au8[11] = uSrc2.au8[5];
13096 uDstOut.au8[12] = uSrc1.au8[6];
13097 uDstOut.au8[13] = uSrc2.au8[6];
13098 uDstOut.au8[14] = uSrc1.au8[7];
13099 uDstOut.au8[15] = uSrc2.au8[7];
13100 *puDst = uDstOut;
13101}
13102
13103
13104IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13105{
13106 RTUINT256U const uSrc2 = *puSrc2;
13107 RTUINT256U const uSrc1 = *puSrc1;
13108 ASMCompilerBarrier();
13109 RTUINT256U uDstOut;
13110 uDstOut.au8[ 0] = uSrc1.au8[0];
13111 uDstOut.au8[ 1] = uSrc2.au8[0];
13112 uDstOut.au8[ 2] = uSrc1.au8[1];
13113 uDstOut.au8[ 3] = uSrc2.au8[1];
13114 uDstOut.au8[ 4] = uSrc1.au8[2];
13115 uDstOut.au8[ 5] = uSrc2.au8[2];
13116 uDstOut.au8[ 6] = uSrc1.au8[3];
13117 uDstOut.au8[ 7] = uSrc2.au8[3];
13118 uDstOut.au8[ 8] = uSrc1.au8[4];
13119 uDstOut.au8[ 9] = uSrc2.au8[4];
13120 uDstOut.au8[10] = uSrc1.au8[5];
13121 uDstOut.au8[11] = uSrc2.au8[5];
13122 uDstOut.au8[12] = uSrc1.au8[6];
13123 uDstOut.au8[13] = uSrc2.au8[6];
13124 uDstOut.au8[14] = uSrc1.au8[7];
13125 uDstOut.au8[15] = uSrc2.au8[7];
13126 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
13127 uDstOut.au8[16] = uSrc1.au8[16];
13128 uDstOut.au8[17] = uSrc2.au8[16];
13129 uDstOut.au8[18] = uSrc1.au8[17];
13130 uDstOut.au8[19] = uSrc2.au8[17];
13131 uDstOut.au8[20] = uSrc1.au8[18];
13132 uDstOut.au8[21] = uSrc2.au8[18];
13133 uDstOut.au8[22] = uSrc1.au8[19];
13134 uDstOut.au8[23] = uSrc2.au8[19];
13135 uDstOut.au8[24] = uSrc1.au8[20];
13136 uDstOut.au8[25] = uSrc2.au8[20];
13137 uDstOut.au8[26] = uSrc1.au8[21];
13138 uDstOut.au8[27] = uSrc2.au8[21];
13139 uDstOut.au8[28] = uSrc1.au8[22];
13140 uDstOut.au8[29] = uSrc2.au8[22];
13141 uDstOut.au8[30] = uSrc1.au8[23];
13142 uDstOut.au8[31] = uSrc2.au8[23];
13143 *puDst = uDstOut;
13144}
13145
13146
/*
 * PUNPCKLWD - low words -> dwords
 */
13150#ifdef IEM_WITHOUT_ASSEMBLY
13151
13152IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
13153{
13154 RTUINT64U const uSrc2 = { *puSrc };
13155 RTUINT64U const uSrc1 = { *puDst };
13156 ASMCompilerBarrier();
13157 RTUINT64U uDstOut;
13158 uDstOut.au16[0] = uSrc1.au16[0];
13159 uDstOut.au16[1] = uSrc2.au16[0];
13160 uDstOut.au16[2] = uSrc1.au16[1];
13161 uDstOut.au16[3] = uSrc2.au16[1];
13162 *puDst = uDstOut.u;
13163}
13164
13165
13166IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13167{
13168 RTUINT128U const uSrc2 = *puSrc;
13169 RTUINT128U const uSrc1 = *puDst;
13170 ASMCompilerBarrier();
13171 RTUINT128U uDstOut;
13172 uDstOut.au16[0] = uSrc1.au16[0];
13173 uDstOut.au16[1] = uSrc2.au16[0];
13174 uDstOut.au16[2] = uSrc1.au16[1];
13175 uDstOut.au16[3] = uSrc2.au16[1];
13176 uDstOut.au16[4] = uSrc1.au16[2];
13177 uDstOut.au16[5] = uSrc2.au16[2];
13178 uDstOut.au16[6] = uSrc1.au16[3];
13179 uDstOut.au16[7] = uSrc2.au16[3];
13180 *puDst = uDstOut;
13181}
13182
13183#endif
13184
13185IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13186{
13187 RTUINT128U const uSrc2 = *puSrc2;
13188 RTUINT128U const uSrc1 = *puSrc1;
13189 ASMCompilerBarrier();
13190 RTUINT128U uDstOut;
13191 uDstOut.au16[0] = uSrc1.au16[0];
13192 uDstOut.au16[1] = uSrc2.au16[0];
13193 uDstOut.au16[2] = uSrc1.au16[1];
13194 uDstOut.au16[3] = uSrc2.au16[1];
13195 uDstOut.au16[4] = uSrc1.au16[2];
13196 uDstOut.au16[5] = uSrc2.au16[2];
13197 uDstOut.au16[6] = uSrc1.au16[3];
13198 uDstOut.au16[7] = uSrc2.au16[3];
13199 *puDst = uDstOut;
13200}
13201
13202
13203IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13204{
13205 RTUINT256U const uSrc2 = *puSrc2;
13206 RTUINT256U const uSrc1 = *puSrc1;
13207 ASMCompilerBarrier();
13208 RTUINT256U uDstOut;
13209 uDstOut.au16[0] = uSrc1.au16[0];
13210 uDstOut.au16[1] = uSrc2.au16[0];
13211 uDstOut.au16[2] = uSrc1.au16[1];
13212 uDstOut.au16[3] = uSrc2.au16[1];
13213 uDstOut.au16[4] = uSrc1.au16[2];
13214 uDstOut.au16[5] = uSrc2.au16[2];
13215 uDstOut.au16[6] = uSrc1.au16[3];
13216 uDstOut.au16[7] = uSrc2.au16[3];
13217
13218 uDstOut.au16[8] = uSrc1.au16[8];
13219 uDstOut.au16[9] = uSrc2.au16[8];
13220 uDstOut.au16[10] = uSrc1.au16[9];
13221 uDstOut.au16[11] = uSrc2.au16[9];
13222 uDstOut.au16[12] = uSrc1.au16[10];
13223 uDstOut.au16[13] = uSrc2.au16[10];
13224 uDstOut.au16[14] = uSrc1.au16[11];
13225 uDstOut.au16[15] = uSrc2.au16[11];
13226 *puDst = uDstOut;
13227}
13228
13229
/*
 * PUNPCKLDQ - low dwords -> qword(s)
 */
13233#ifdef IEM_WITHOUT_ASSEMBLY
13234
13235IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
13236{
13237 RTUINT64U const uSrc2 = { *puSrc };
13238 RTUINT64U const uSrc1 = { *puDst };
13239 ASMCompilerBarrier();
13240 RTUINT64U uDstOut;
13241 uDstOut.au32[0] = uSrc1.au32[0];
13242 uDstOut.au32[1] = uSrc2.au32[0];
13243 *puDst = uDstOut.u;
13244}
13245
13246
13247IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13248{
13249 RTUINT128U const uSrc2 = *puSrc;
13250 RTUINT128U const uSrc1 = *puDst;
13251 ASMCompilerBarrier();
13252 RTUINT128U uDstOut;
13253 uDstOut.au32[0] = uSrc1.au32[0];
13254 uDstOut.au32[1] = uSrc2.au32[0];
13255 uDstOut.au32[2] = uSrc1.au32[1];
13256 uDstOut.au32[3] = uSrc2.au32[1];
13257 *puDst = uDstOut;
13258}
13259
13260#endif
13261
13262IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13263{
13264 RTUINT128U const uSrc2 = *puSrc2;
13265 RTUINT128U const uSrc1 = *puSrc1;
13266 ASMCompilerBarrier();
13267 RTUINT128U uDstOut;
13268 uDstOut.au32[0] = uSrc1.au32[0];
13269 uDstOut.au32[1] = uSrc2.au32[0];
13270 uDstOut.au32[2] = uSrc1.au32[1];
13271 uDstOut.au32[3] = uSrc2.au32[1];
13272 *puDst = uDstOut;
13273}
13274
13275
13276IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13277{
13278 RTUINT256U const uSrc2 = *puSrc2;
13279 RTUINT256U const uSrc1 = *puSrc1;
13280 ASMCompilerBarrier();
13281 RTUINT256U uDstOut;
13282 uDstOut.au32[0] = uSrc1.au32[0];
13283 uDstOut.au32[1] = uSrc2.au32[0];
13284 uDstOut.au32[2] = uSrc1.au32[1];
13285 uDstOut.au32[3] = uSrc2.au32[1];
13286
13287 uDstOut.au32[4] = uSrc1.au32[4];
13288 uDstOut.au32[5] = uSrc2.au32[4];
13289 uDstOut.au32[6] = uSrc1.au32[5];
13290 uDstOut.au32[7] = uSrc2.au32[5];
13291 *puDst = uDstOut;
13292}
13293
13294
13295/*
13296 * PUNPCKLQDQ -> Low qwords -> double qword(s).
13297 */
13298#ifdef IEM_WITHOUT_ASSEMBLY
13299IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13300{
13301 RTUINT128U const uSrc2 = *puSrc;
13302 RTUINT128U const uSrc1 = *puDst;
13303 ASMCompilerBarrier();
13304 RTUINT128U uDstOut;
13305 uDstOut.au64[0] = uSrc1.au64[0];
13306 uDstOut.au64[1] = uSrc2.au64[0];
13307 *puDst = uDstOut;
13308}
13309#endif
13310
13311
13312IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13313{
13314 RTUINT128U const uSrc2 = *puSrc2;
13315 RTUINT128U const uSrc1 = *puSrc1;
13316 ASMCompilerBarrier();
13317 RTUINT128U uDstOut;
13318 uDstOut.au64[0] = uSrc1.au64[0];
13319 uDstOut.au64[1] = uSrc2.au64[0];
13320 *puDst = uDstOut;
13321}
13322
13323
13324IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13325{
13326 RTUINT256U const uSrc2 = *puSrc2;
13327 RTUINT256U const uSrc1 = *puSrc1;
13328 ASMCompilerBarrier();
13329 RTUINT256U uDstOut;
13330 uDstOut.au64[0] = uSrc1.au64[0];
13331 uDstOut.au64[1] = uSrc2.au64[0];
13332
13333 uDstOut.au64[2] = uSrc1.au64[2];
13334 uDstOut.au64[3] = uSrc2.au64[2];
13335 *puDst = uDstOut;
13336}
13337
13338
13339/*
13340 * PACKSSWB - signed words -> signed bytes
13341 */
13342
13343#ifdef IEM_WITHOUT_ASSEMBLY
13344
13345IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13346{
13347 RTUINT64U const uSrc2 = { *puSrc };
13348 RTUINT64U const uSrc1 = { *puDst };
13349 ASMCompilerBarrier();
13350 RTUINT64U uDstOut;
13351 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13352 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13353 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13354 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13355 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13356 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13357 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13358 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13359 *puDst = uDstOut.u;
13360}
13361
13362
13363IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13364{
13365 RTUINT128U const uSrc2 = *puSrc;
13366 RTUINT128U const uSrc1 = *puDst;
13367 ASMCompilerBarrier();
13368 RTUINT128U uDstOut;
13369 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13370 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13371 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13372 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13373 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13374 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13375 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13376 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13377 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13378 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13379 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13380 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13381 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13382 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13383 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13384 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13385 *puDst = uDstOut;
13386}
13387
13388#endif
13389
13390IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13391{
13392 RTUINT128U const uSrc2 = *puSrc2;
13393 RTUINT128U const uSrc1 = *puSrc1;
13394 ASMCompilerBarrier();
13395 RTUINT128U uDstOut;
13396 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13397 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13398 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13399 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13400 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13401 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13402 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13403 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13404 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13405 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13406 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13407 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13408 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13409 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13410 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13411 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13412 *puDst = uDstOut;
13413}
13414
13415
13416IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13417{
13418 RTUINT256U const uSrc2 = *puSrc2;
13419 RTUINT256U const uSrc1 = *puSrc1;
13420 ASMCompilerBarrier();
13421 RTUINT256U uDstOut;
13422 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13423 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13424 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13425 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13426 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13427 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13428 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13429 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13430 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13431 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13432 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13433 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13434 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13435 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13436 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13437 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13438
13439 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
13440 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
13441 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
13442 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
13443 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
13444 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
13445 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
13446 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
13447 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
13448 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
13449 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
13450 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
13451 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
13452 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
13453 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
13454 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
13455 *puDst = uDstOut;
13456}
13457
13458
13459/*
13460 * PACKUSWB - signed words -> unsigned bytes
13461 */
/**
 * Narrows a 16-bit word, interpreted as signed, to an unsigned byte with
 * saturation.
 *
 * Values whose low 16 bits are in 0x0000..0x00ff pass through unchanged.
 * Anything else yields 0xff when bit 15 (the sign) is clear and 0x00 when
 * bit 15 is set.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    (   (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) /* sign clear: 0xff (UINT8_MAX); sign set: 0x00 */ \
      : (uint8_t)(a_iWord) )
13466
13467#ifdef IEM_WITHOUT_ASSEMBLY
13468
13469IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13470{
13471 RTUINT64U const uSrc2 = { *puSrc };
13472 RTUINT64U const uSrc1 = { *puDst };
13473 ASMCompilerBarrier();
13474 RTUINT64U uDstOut;
13475 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13476 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13477 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13478 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13479 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13480 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13481 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13482 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13483 *puDst = uDstOut.u;
13484}
13485
13486
13487IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13488{
13489 RTUINT128U const uSrc2 = *puSrc;
13490 RTUINT128U const uSrc1 = *puDst;
13491 ASMCompilerBarrier();
13492 RTUINT128U uDstOut;
13493 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13494 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13495 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13496 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13497 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13498 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13499 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13500 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13501 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13502 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13503 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13504 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13505 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13506 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13507 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13508 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13509 *puDst = uDstOut;
13510}
13511
13512#endif
13513
13514IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13515{
13516 RTUINT128U const uSrc2 = *puSrc2;
13517 RTUINT128U const uSrc1 = *puSrc1;
13518 ASMCompilerBarrier();
13519 RTUINT128U uDstOut;
13520 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13521 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13522 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13523 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13524 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13525 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13526 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13527 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13528 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13529 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13530 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13531 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13532 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13533 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13534 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13535 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13536 *puDst = uDstOut;
13537}
13538
13539
13540IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13541{
13542 RTUINT256U const uSrc2 = *puSrc2;
13543 RTUINT256U const uSrc1 = *puSrc1;
13544 ASMCompilerBarrier();
13545 RTUINT256U uDstOut;
13546 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13547 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13548 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13549 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13550 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13551 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13552 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13553 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13554 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13555 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13556 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13557 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13558 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13559 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13560 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13561 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13562
13563 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
13564 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
13565 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
13566 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
13567 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
13568 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
13569 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
13570 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
13571 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
13572 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
13573 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
13574 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
13575 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
13576 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
13577 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
13578 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
13579 *puDst = uDstOut;
13580}
13581
13582
13583/*
13584 * PACKSSDW - signed dwords -> signed words
13585 */
13586
13587#ifdef IEM_WITHOUT_ASSEMBLY
13588
13589IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13590{
13591 RTUINT64U const uSrc2 = { *puSrc };
13592 RTUINT64U const uSrc1 = { *puDst };
13593 ASMCompilerBarrier();
13594 RTUINT64U uDstOut;
13595 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13596 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13597 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13598 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13599 *puDst = uDstOut.u;
13600}
13601
13602
13603IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13604{
13605 RTUINT128U const uSrc2 = *puSrc;
13606 RTUINT128U const uSrc1 = *puDst;
13607 ASMCompilerBarrier();
13608 RTUINT128U uDstOut;
13609 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13610 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13611 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13612 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13613 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13614 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13615 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13616 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13617 *puDst = uDstOut;
13618}
13619
13620#endif
13621
13622IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13623{
13624 RTUINT128U const uSrc2 = *puSrc2;
13625 RTUINT128U const uSrc1 = *puSrc1;
13626 ASMCompilerBarrier();
13627 RTUINT128U uDstOut;
13628 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13629 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13630 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13631 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13632 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13633 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13634 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13635 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13636 *puDst = uDstOut;
13637}
13638
13639
13640IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13641{
13642 RTUINT256U const uSrc2 = *puSrc2;
13643 RTUINT256U const uSrc1 = *puSrc1;
13644 ASMCompilerBarrier();
13645 RTUINT256U uDstOut;
13646 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13647 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13648 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13649 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13650 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13651 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13652 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13653 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13654
13655 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
13656 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
13657 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
13658 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
13659 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
13660 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
13661 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
13662 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
13663 *puDst = uDstOut;
13664}
13665
13666
13667/*
13668 * PACKUSDW - signed dwords -> unsigned words
13669 */
/**
 * Narrows a 32-bit dword, interpreted as signed, to an unsigned 16-bit word
 * with saturation.
 *
 * Values in 0x00000000..0x0000ffff pass through unchanged.  Anything else
 * yields 0xffff when bit 31 (the sign) is clear and 0x0000 when bit 31 is
 * set.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    (   (uint32_t)(a_iDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff * (uint16_t)((((a_iDword) >> 31) & 1) ^ 1) /* sign clear: 0xffff (UINT16_MAX); sign set: 0 */ \
      : (uint16_t)(a_iDword) )
13674
13675#ifdef IEM_WITHOUT_ASSEMBLY
13676IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13677{
13678 RTUINT128U const uSrc2 = *puSrc;
13679 RTUINT128U const uSrc1 = *puDst;
13680 ASMCompilerBarrier();
13681 RTUINT128U uDstOut;
13682 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13683 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13684 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13685 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13686 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13687 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13688 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13689 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13690 *puDst = uDstOut;
13691}
13692#endif
13693
13694IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13695{
13696 RTUINT128U const uSrc2 = *puSrc2;
13697 RTUINT128U const uSrc1 = *puSrc1;
13698 ASMCompilerBarrier();
13699 RTUINT128U uDstOut;
13700 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13701 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13702 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13703 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13704 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13705 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13706 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13707 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13708 *puDst = uDstOut;
13709}
13710
13711
13712IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13713{
13714 RTUINT256U const uSrc2 = *puSrc2;
13715 RTUINT256U const uSrc1 = *puSrc1;
13716 ASMCompilerBarrier();
13717 RTUINT256U uDstOut;
13718 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13719 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13720 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13721 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13722 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13723 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13724 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13725 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13726
13727 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
13728 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
13729 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
13730 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
13731 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
13732 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
13733 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
13734 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
13735 *puDst = uDstOut;
13736}
13737
13738
13739/*
13740 * [V]PABSB / [V]PABSW / [V]PABSD
13741 */
13742
13743IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13744{
13745 RTUINT64U const uSrc = { *puSrc };
13746 RTUINT64U uDstOut = { 0 };
13747
13748 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
13749 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
13750 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
13751 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
13752 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
13753 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
13754 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
13755 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
13756 *puDst = uDstOut.u;
13757 RT_NOREF(pFpuState);
13758}
13759
13760
13761IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13762{
13763 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13764 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13765 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13766 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13767 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13768 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13769 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13770 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13771 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13772 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13773 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13774 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13775 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13776 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13777 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13778 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13779 RT_NOREF(pFpuState);
13780}
13781
13782
13783IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13784{
13785 RTUINT64U const uSrc = { *puSrc };
13786 RTUINT64U uDstOut = { 0 };
13787
13788 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
13789 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
13790 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
13791 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
13792 *puDst = uDstOut.u;
13793 RT_NOREF(pFpuState);
13794}
13795
13796
13797IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13798{
13799 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13800 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13801 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13802 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13803 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13804 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13805 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13806 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13807 RT_NOREF(pFpuState);
13808}
13809
13810
13811IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13812{
13813 RTUINT64U const uSrc = { *puSrc };
13814 RTUINT64U uDstOut = { 0 };
13815
13816 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
13817 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
13818 *puDst = uDstOut.u;
13819 RT_NOREF(pFpuState);
13820}
13821
13822
13823IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13824{
13825 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13826 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13827 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13828 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13829 RT_NOREF(pFpuState);
13830}
13831
13832
13833IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13834{
13835 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13836 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13837 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13838 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13839 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13840 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13841 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13842 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13843 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13844 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13845 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13846 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13847 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13848 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13849 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13850 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13851}
13852
13853
13854IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13855{
13856 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13857 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13858 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13859 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13860 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13861 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13862 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13863 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13864 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13865 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13866 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13867 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13868 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13869 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13870 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13871 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13872 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13873 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13874 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13875 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13876 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13877 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13878 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13879 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13880 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13881 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13882 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13883 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13884 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13885 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13886 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13887 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13888}
13889
13890
13891IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13892{
13893 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13894 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13895 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13896 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13897 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13898 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13899 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13900 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13901}
13902
13903
13904IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13905{
13906 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13907 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13908 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13909 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13910 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13911 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13912 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13913 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13914 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13915 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13916 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13917 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13918 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13919 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13920 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13921 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13922}
13923
13924
13925IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13926{
13927 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13928 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13929 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13930 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13931}
13932
13933
13934IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13935{
13936 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13937 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13938 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13939 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13940 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13941 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13942 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13943 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13944}
13945
13946
13947/*
13948 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13949 */
13950IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13951{
13952 RTUINT64U uSrc1 = { *puDst };
13953 RTUINT64U uSrc2 = { *puSrc };
13954 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13955
13956 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13957 {
13958 if (uSrc2.ai8[i] < 0)
13959 uDst.ai8[i] = -uSrc1.ai8[i];
13960 else if (uSrc2.ai8[i] == 0)
13961 uDst.ai8[i] = 0;
13962 else /* uSrc2.ai8[i] > 0 */
13963 uDst.ai8[i] = uSrc1.ai8[i];
13964 }
13965
13966 *puDst = uDst.u;
13967 RT_NOREF(pFpuState);
13968}
13969
13970
13971IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13972{
13973 RTUINT128U uSrc1 = *puDst;
13974
13975 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13976 {
13977 if (puSrc->ai8[i] < 0)
13978 puDst->ai8[i] = -uSrc1.ai8[i];
13979 else if (puSrc->ai8[i] == 0)
13980 puDst->ai8[i] = 0;
13981 else /* puSrc->ai8[i] > 0 */
13982 puDst->ai8[i] = uSrc1.ai8[i];
13983 }
13984
13985 RT_NOREF(pFpuState);
13986}
13987
13988
13989IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13990{
13991 RTUINT64U uSrc1 = { *puDst };
13992 RTUINT64U uSrc2 = { *puSrc };
13993 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13994
13995 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13996 {
13997 if (uSrc2.ai16[i] < 0)
13998 uDst.ai16[i] = -uSrc1.ai16[i];
13999 else if (uSrc2.ai16[i] == 0)
14000 uDst.ai16[i] = 0;
14001 else /* uSrc2.ai16[i] > 0 */
14002 uDst.ai16[i] = uSrc1.ai16[i];
14003 }
14004
14005 *puDst = uDst.u;
14006 RT_NOREF(pFpuState);
14007}
14008
14009
14010IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14011{
14012 RTUINT128U uSrc1 = *puDst;
14013
14014 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
14015 {
14016 if (puSrc->ai16[i] < 0)
14017 puDst->ai16[i] = -uSrc1.ai16[i];
14018 else if (puSrc->ai16[i] == 0)
14019 puDst->ai16[i] = 0;
14020 else /* puSrc->ai16[i] > 0 */
14021 puDst->ai16[i] = uSrc1.ai16[i];
14022 }
14023
14024 RT_NOREF(pFpuState);
14025}
14026
14027
14028IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14029{
14030 RTUINT64U uSrc1 = { *puDst };
14031 RTUINT64U uSrc2 = { *puSrc };
14032 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14033
14034 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
14035 {
14036 if (uSrc2.ai32[i] < 0)
14037 uDst.ai32[i] = -uSrc1.ai32[i];
14038 else if (uSrc2.ai32[i] == 0)
14039 uDst.ai32[i] = 0;
14040 else /* uSrc2.ai32[i] > 0 */
14041 uDst.ai32[i] = uSrc1.ai32[i];
14042 }
14043
14044 *puDst = uDst.u;
14045 RT_NOREF(pFpuState);
14046}
14047
14048
14049IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14050{
14051 RTUINT128U uSrc1 = *puDst;
14052
14053 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14054 {
14055 if (puSrc->ai32[i] < 0)
14056 puDst->ai32[i] = -uSrc1.ai32[i];
14057 else if (puSrc->ai32[i] == 0)
14058 puDst->ai32[i] = 0;
14059 else /* puSrc->ai32[i] > 0 */
14060 puDst->ai32[i] = uSrc1.ai32[i];
14061 }
14062
14063 RT_NOREF(pFpuState);
14064}
14065
14066
14067IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14068{
14069 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
14070 {
14071 if (puSrc2->ai8[i] < 0)
14072 puDst->ai8[i] = -puSrc1->ai8[i];
14073 else if (puSrc2->ai8[i] == 0)
14074 puDst->ai8[i] = 0;
14075 else /* puSrc2->ai8[i] > 0 */
14076 puDst->ai8[i] = puSrc1->ai8[i];
14077 }
14078}
14079
14080
14081IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14082{
14083 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
14084 {
14085 if (puSrc2->ai8[i] < 0)
14086 puDst->ai8[i] = -puSrc1->ai8[i];
14087 else if (puSrc2->ai8[i] == 0)
14088 puDst->ai8[i] = 0;
14089 else /* puSrc2->ai8[i] > 0 */
14090 puDst->ai8[i] = puSrc1->ai8[i];
14091 }
14092}
14093
14094
14095IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14096{
14097 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
14098 {
14099 if (puSrc2->ai16[i] < 0)
14100 puDst->ai16[i] = -puSrc1->ai16[i];
14101 else if (puSrc2->ai16[i] == 0)
14102 puDst->ai16[i] = 0;
14103 else /* puSrc2->ai16[i] > 0 */
14104 puDst->ai16[i] = puSrc1->ai16[i];
14105 }
14106}
14107
14108
14109IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14110{
14111 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
14112 {
14113 if (puSrc2->ai16[i] < 0)
14114 puDst->ai16[i] = -puSrc1->ai16[i];
14115 else if (puSrc2->ai16[i] == 0)
14116 puDst->ai16[i] = 0;
14117 else /* puSrc2->ai16[i] > 0 */
14118 puDst->ai16[i] = puSrc1->ai16[i];
14119 }
14120}
14121
14122
14123IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14124{
14125 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14126 {
14127 if (puSrc2->ai32[i] < 0)
14128 puDst->ai32[i] = -puSrc1->ai32[i];
14129 else if (puSrc2->ai32[i] == 0)
14130 puDst->ai32[i] = 0;
14131 else /* puSrc2->ai32[i] > 0 */
14132 puDst->ai32[i] = puSrc1->ai32[i];
14133 }
14134}
14135
14136
14137IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14138{
14139 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14140 {
14141 if (puSrc2->ai32[i] < 0)
14142 puDst->ai32[i] = -puSrc1->ai32[i];
14143 else if (puSrc2->ai32[i] == 0)
14144 puDst->ai32[i] = 0;
14145 else /* puSrc2->ai32[i] > 0 */
14146 puDst->ai32[i] = puSrc1->ai32[i];
14147 }
14148}
14149
14150
14151/*
14152 * PHADDW / VPHADDW / PHADDD / VPHADDD
14153 */
14154IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14155{
14156 RTUINT64U uSrc1 = { *puDst };
14157 RTUINT64U uSrc2 = { *puSrc };
14158 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14159
14160 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14161 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14162 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
14163 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
14164 *puDst = uDst.u;
14165 RT_NOREF(pFpuState);
14166}
14167
14168
14169IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14170{
14171 RTUINT128U uSrc1 = *puDst;
14172
14173 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14174 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14175 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
14176 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
14177
14178 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
14179 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
14180 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
14181 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
14182 RT_NOREF(pFpuState);
14183}
14184
14185
14186IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14187{
14188 RTUINT64U uSrc1 = { *puDst };
14189 RTUINT64U uSrc2 = { *puSrc };
14190 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14191
14192 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14193 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
14194 *puDst = uDst.u;
14195 RT_NOREF(pFpuState);
14196}
14197
14198
14199IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14200{
14201 RTUINT128U uSrc1 = *puDst;
14202
14203 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14204 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
14205
14206 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
14207 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
14208 RT_NOREF(pFpuState);
14209}
14210
14211
14212IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14213{
14214 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14215
14216 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
14217 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
14218 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
14219 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
14220
14221 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
14222 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
14223 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
14224 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
14225
14226 puDst->au64[0] = uDst.au64[0];
14227 puDst->au64[1] = uDst.au64[1];
14228}
14229
14230
14231IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14232{
14233 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14234
14235 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
14236 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
14237 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
14238 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
14239 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
14240 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
14241 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
14242 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
14243
14244 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
14245 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
14246 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
14247 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
14248 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
14249 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
14250 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
14251 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
14252
14253 puDst->au64[0] = uDst.au64[0];
14254 puDst->au64[1] = uDst.au64[1];
14255 puDst->au64[2] = uDst.au64[2];
14256 puDst->au64[3] = uDst.au64[3];
14257}
14258
14259
14260IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14261{
14262 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14263
14264 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
14265 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
14266
14267 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
14268 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
14269
14270 puDst->au64[0] = uDst.au64[0];
14271 puDst->au64[1] = uDst.au64[1];
14272}
14273
14274
14275IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14276{
14277 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14278
14279 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
14280 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
14281 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
14282 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
14283
14284 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
14285 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
14286 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
14287 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
14288
14289 puDst->au64[0] = uDst.au64[0];
14290 puDst->au64[1] = uDst.au64[1];
14291 puDst->au64[2] = uDst.au64[2];
14292 puDst->au64[3] = uDst.au64[3];
14293}
14294
14295
14296/*
14297 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
14298 */
14299IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14300{
14301 RTUINT64U uSrc1 = { *puDst };
14302 RTUINT64U uSrc2 = { *puSrc };
14303 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14304
14305 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14306 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14307 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
14308 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
14309 *puDst = uDst.u;
14310 RT_NOREF(pFpuState);
14311}
14312
14313
14314IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14315{
14316 RTUINT128U uSrc1 = *puDst;
14317
14318 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14319 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14320 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
14321 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
14322
14323 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
14324 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
14325 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
14326 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
14327 RT_NOREF(pFpuState);
14328}
14329
14330
14331IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14332{
14333 RTUINT64U uSrc1 = { *puDst };
14334 RTUINT64U uSrc2 = { *puSrc };
14335 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14336
14337 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14338 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
14339 *puDst = uDst.u;
14340 RT_NOREF(pFpuState);
14341}
14342
14343
14344IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14345{
14346 RTUINT128U uSrc1 = *puDst;
14347
14348 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14349 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
14350
14351 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
14352 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
14353 RT_NOREF(pFpuState);
14354}
14355
14356
14357IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14358{
14359 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14360
14361 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
14362 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
14363 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
14364 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
14365
14366 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
14367 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
14368 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
14369 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
14370
14371 puDst->au64[0] = uDst.au64[0];
14372 puDst->au64[1] = uDst.au64[1];
14373}
14374
14375
14376IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14377{
14378 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14379
14380 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
14381 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
14382 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
14383 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
14384 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
14385 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
14386 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
14387 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
14388
14389 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
14390 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
14391 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
14392 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
14393 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
14394 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
14395 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
14396 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
14397
14398 puDst->au64[0] = uDst.au64[0];
14399 puDst->au64[1] = uDst.au64[1];
14400 puDst->au64[2] = uDst.au64[2];
14401 puDst->au64[3] = uDst.au64[3];
14402}
14403
14404
14405IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14406{
14407 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14408
14409 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
14410 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
14411
14412 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
14413 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
14414
14415 puDst->au64[0] = uDst.au64[0];
14416 puDst->au64[1] = uDst.au64[1];
14417}
14418
14419
14420IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14421{
14422 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14423
14424 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
14425 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
14426 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
14427 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
14428
14429 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
14430 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
14431 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
14432 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
14433
14434 puDst->au64[0] = uDst.au64[0];
14435 puDst->au64[1] = uDst.au64[1];
14436 puDst->au64[2] = uDst.au64[2];
14437 puDst->au64[3] = uDst.au64[3];
14438}
14439
14440
14441/*
14442 * PHADDSW / VPHADDSW
14443 */
14444IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14445{
14446 RTUINT64U uSrc1 = { *puDst };
14447 RTUINT64U uSrc2 = { *puSrc };
14448 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14449
14450 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14451 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14452 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
14453 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
14454 *puDst = uDst.u;
14455 RT_NOREF(pFpuState);
14456}
14457
14458
14459IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14460{
14461 RTUINT128U uSrc1 = *puDst;
14462
14463 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14464 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14465 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
14466 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
14467
14468 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
14469 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
14470 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
14471 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
14472 RT_NOREF(pFpuState);
14473}
14474
14475
14476IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14477{
14478 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14479
14480 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
14481 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
14482 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
14483 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
14484
14485 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
14486 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
14487 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
14488 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
14489
14490 puDst->au64[0] = uDst.au64[0];
14491 puDst->au64[1] = uDst.au64[1];
14492}
14493
14494
14495IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14496{
14497 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14498
14499 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
14500 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
14501 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
14502 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
14503 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
14504 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
14505 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
14506 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
14507
14508 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
14509 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
14510 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
14511 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
14512 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
14513 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
14514 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
14515 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
14516
14517 puDst->au64[0] = uDst.au64[0];
14518 puDst->au64[1] = uDst.au64[1];
14519 puDst->au64[2] = uDst.au64[2];
14520 puDst->au64[3] = uDst.au64[3];
14521}
14522
14523
14524/*
14525 * PHSUBSW / VPHSUBSW
14526 */
14527IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14528{
14529 RTUINT64U uSrc1 = { *puDst };
14530 RTUINT64U uSrc2 = { *puSrc };
14531 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14532
14533 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14534 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14535 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
14536 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
14537 *puDst = uDst.u;
14538 RT_NOREF(pFpuState);
14539}
14540
14541
14542IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14543{
14544 RTUINT128U uSrc1 = *puDst;
14545
14546 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14547 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14548 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
14549 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
14550
14551 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
14552 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
14553 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
14554 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
14555 RT_NOREF(pFpuState);
14556}
14557
14558
14559IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14560{
14561 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14562
14563 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
14564 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
14565 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
14566 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
14567
14568 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
14569 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
14570 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
14571 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
14572
14573 puDst->au64[0] = uDst.au64[0];
14574 puDst->au64[1] = uDst.au64[1];
14575}
14576
14577
14578IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14579{
14580 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14581
14582 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
14583 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
14584 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
14585 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
14586 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
14587 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
14588 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
14589 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
14590
14591 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
14592 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
14593 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
14594 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
14595 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
14596 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
14597 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
14598 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
14599
14600 puDst->au64[0] = uDst.au64[0];
14601 puDst->au64[1] = uDst.au64[1];
14602 puDst->au64[2] = uDst.au64[2];
14603 puDst->au64[3] = uDst.au64[3];
14604}
14605
14606
14607/*
14608 * PMADDUBSW / VPMADDUBSW
14609 */
14610IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14611{
14612 RTUINT64U uSrc1 = { *puDst };
14613 RTUINT64U uSrc2 = { *puSrc };
14614 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14615
14616 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
14617 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
14618 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
14619 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
14620 *puDst = uDst.u;
14621 RT_NOREF(pFpuState);
14622}
14623
14624
14625IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14626{
14627 RTUINT128U uSrc1 = *puDst;
14628
14629 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
14630 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
14631 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
14632 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
14633 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
14634 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
14635 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
14636 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
14637 RT_NOREF(pFpuState);
14638}
14639
14640
14641IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14642{
14643 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14644
14645 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14646 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14647 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14648 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14649 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14650 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14651 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14652 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14653
14654 puDst->au64[0] = uDst.au64[0];
14655 puDst->au64[1] = uDst.au64[1];
14656}
14657
14658
14659IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14660{
14661 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14662
14663 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14664 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14665 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14666 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14667 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14668 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14669 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14670 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14671 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
14672 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
14673 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
14674 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
14675 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
14676 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
14677 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
14678 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
14679
14680 puDst->au64[0] = uDst.au64[0];
14681 puDst->au64[1] = uDst.au64[1];
14682 puDst->au64[2] = uDst.au64[2];
14683 puDst->au64[3] = uDst.au64[3];
14684}
14685
14686
14687/*
14688 * PMULHRSW / VPMULHRSW
14689 */
/** Multiplies the two operands as 32-bit signed values, shifts the product
 * right by 14, adds one, shifts right by one more and truncates the result
 * to 16 bits. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ( (uint16_t)( ( ( ((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1 ) >> 1 ) )
14692
14693IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14694{
14695 RTUINT64U uSrc1 = { *puDst };
14696 RTUINT64U uSrc2 = { *puSrc };
14697 RTUINT64U uDst;
14698
14699 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
14700 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
14701 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
14702 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
14703 *puDst = uDst.u;
14704 RT_NOREF(pFpuState);
14705}
14706
14707
14708IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14709{
14710 RTUINT128U uSrc1 = *puDst;
14711
14712 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
14713 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
14714 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
14715 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
14716 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
14717 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
14718 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
14719 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
14720 RT_NOREF(pFpuState);
14721}
14722
14723
14724IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14725{
14726 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14727
14728 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
14729 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
14730 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
14731 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
14732 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
14733 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
14734 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
14735 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
14736
14737 puDst->au64[0] = uDst.au64[0];
14738 puDst->au64[1] = uDst.au64[1];
14739}
14740
14741
14742IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14743{
14744 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14745
14746 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
14747 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
14748 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
14749 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
14750 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
14751 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
14752 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
14753 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
14754 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
14755 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
14756 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
14757 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
14758 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
14759 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
14760 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
14761 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
14762
14763 puDst->au64[0] = uDst.au64[0];
14764 puDst->au64[1] = uDst.au64[1];
14765 puDst->au64[2] = uDst.au64[2];
14766 puDst->au64[3] = uDst.au64[3];
14767}
14768
14769
14770/*
14771 * PSADBW / VPSADBW
14772 */
14773#ifdef IEM_WITHOUT_ASSEMBLY
14774
14775IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
14776{
14777 RTUINT64U uSrc1 = { *puDst };
14778 RTUINT64U uSrc2 = { *puSrc };
14779 RTUINT64U uDst;
14780 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14781 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14782 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14783 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14784 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14785 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14786 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14787 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14788
14789 uDst.au64[0] = 0;
14790 uDst.au16[0] = uSum;
14791 *puDst = uDst.u;
14792}
14793
14794
14795IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14796{
14797 RTUINT128U uSrc1 = *puDst;
14798
14799 puDst->au64[0] = 0;
14800 puDst->au64[1] = 0;
14801
14802 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
14803 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
14804 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
14805 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
14806 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
14807 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
14808 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
14809 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
14810 puDst->au16[0] = uSum;
14811
14812 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
14813 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
14814 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
14815 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
14816 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
14817 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
14818 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
14819 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
14820 puDst->au16[4] = uSum;
14821}
14822
14823#endif
14824
14825IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14826{
14827 RTUINT128U uSrc1 = *puSrc1;
14828 RTUINT128U uSrc2 = *puSrc2;
14829
14830 puDst->au64[0] = 0;
14831 puDst->au64[1] = 0;
14832
14833 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
14834 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14835 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14836 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14837 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14838 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14839 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14840 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14841 puDst->au16[0] = uSum;
14842
14843 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14844 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14845 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14846 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14847 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14848 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14849 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14850 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14851 puDst->au16[4] = uSum;
14852}
14853
14854IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14855{
14856 RTUINT256U uSrc1 = *puSrc1;
14857 RTUINT256U uSrc2 = *puSrc2;
14858
14859 puDst->au64[0] = 0;
14860 puDst->au64[1] = 0;
14861 puDst->au64[2] = 0;
14862 puDst->au64[3] = 0;
14863
14864 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14865 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14866 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14867 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14868 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14869 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14870 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14871 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14872 puDst->au16[0] = uSum;
14873
14874 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14875 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14876 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14877 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14878 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14879 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14880 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14881 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14882 puDst->au16[4] = uSum;
14883
14884 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14885 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14886 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14887 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14888 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14889 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14890 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14891 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14892 puDst->au16[8] = uSum;
14893
14894 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14895 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14896 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14897 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14898 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14899 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14900 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14901 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14902 puDst->au16[12] = uSum;
14903}
14904
14905
14906/*
14907 * PMULDQ / VPMULDQ
14908 */
14909IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14910{
14911 RTUINT128U uSrc1 = *puDst;
14912
14913 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14914 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14915}
14916
14917IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14918{
14919 RTUINT128U uSrc1 = *puSrc1;
14920 RTUINT128U uSrc2 = *puSrc2;
14921
14922 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14923 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14924}
14925
14926IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14927{
14928 RTUINT256U uSrc1 = *puSrc1;
14929 RTUINT256U uSrc2 = *puSrc2;
14930
14931 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14932 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14933 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14934 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14935}
14936
14937
14938/*
14939 * PMULUDQ / VPMULUDQ
14940 */
14941#ifdef IEM_WITHOUT_ASSEMBLY
14942
14943IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14944{
14945 RTUINT64U uSrc1 = { *puDst };
14946 RTUINT64U uSrc2 = { *puSrc };
14947 ASMCompilerBarrier();
14948 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14949 RT_NOREF(pFpuState);
14950}
14951
14952
14953IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14954{
14955 RTUINT128U uSrc1 = *puDst;
14956 RTUINT128U uSrc2 = *puSrc;
14957 ASMCompilerBarrier();
14958 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14959 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14960 RT_NOREF(pFpuState);
14961}
14962
14963#endif
14964
14965IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14966{
14967 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14968 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14969 ASMCompilerBarrier();
14970 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14971 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14972}
14973
14974
14975IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14976{
14977 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14978 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14979 ASMCompilerBarrier();
14980 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14981 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14982 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14983 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14984}
14985
14986
14987/*
14988 * UNPCKLPS / VUNPCKLPS
14989 */
14990#ifdef IEM_WITHOUT_ASSEMBLY
14991IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14992{
14993 RTUINT128U uSrc1 = *puDst;
14994 RTUINT128U uSrc2 = *puSrc;
14995 ASMCompilerBarrier();
14996 puDst->au32[0] = uSrc1.au32[0];
14997 puDst->au32[1] = uSrc2.au32[0];
14998 puDst->au32[2] = uSrc1.au32[1];
14999 puDst->au32[3] = uSrc2.au32[1];
15000}
15001
15002#endif
15003
15004IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
15005{
15006 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
15007 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
15008 ASMCompilerBarrier();
15009 puDst->au32[0] = uSrc1.au32[0];
15010 puDst->au32[1] = uSrc2.au32[0];
15011 puDst->au32[2] = uSrc1.au32[1];
15012 puDst->au32[3] = uSrc2.au32[1];
15013}
15014
15015
15016IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15017{
15018 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15019 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15020 ASMCompilerBarrier();
15021 puDst->au32[0] = uSrc1.au32[0];
15022 puDst->au32[1] = uSrc2.au32[0];
15023 puDst->au32[2] = uSrc1.au32[1];
15024 puDst->au32[3] = uSrc2.au32[1];
15025
15026 puDst->au32[4] = uSrc1.au32[4];
15027 puDst->au32[5] = uSrc2.au32[4];
15028 puDst->au32[6] = uSrc1.au32[5];
15029 puDst->au32[7] = uSrc2.au32[5];
15030}
15031
15032
15033/*
15034 * UNPCKLPD / VUNPCKLPD
15035 */
15036#ifdef IEM_WITHOUT_ASSEMBLY
15037IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15038{
15039 RTUINT128U uSrc1 = *puDst;
15040 RTUINT128U uSrc2 = *puSrc;
15041 ASMCompilerBarrier();
15042 puDst->au64[0] = uSrc1.au64[0];
15043 puDst->au64[1] = uSrc2.au64[0];
15044}
15045
15046#endif
15047
15048IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
15049{
15050 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
15051 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
15052 ASMCompilerBarrier();
15053 puDst->au64[0] = uSrc1.au64[0];
15054 puDst->au64[1] = uSrc2.au64[0];
15055}
15056
15057
15058IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15059{
15060 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15061 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15062 ASMCompilerBarrier();
15063 puDst->au64[0] = uSrc1.au64[0];
15064 puDst->au64[1] = uSrc2.au64[0];
15065 puDst->au64[2] = uSrc1.au64[2];
15066 puDst->au64[3] = uSrc2.au64[2];
15067}
15068
15069
15070/*
15071 * UNPCKHPS / VUNPCKHPS
15072 */
15073#ifdef IEM_WITHOUT_ASSEMBLY
15074IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15075{
15076 RTUINT128U uSrc1 = *puDst;
15077 RTUINT128U uSrc2 = *puSrc;
15078 ASMCompilerBarrier();
15079 puDst->au32[0] = uSrc1.au32[2];
15080 puDst->au32[1] = uSrc2.au32[2];
15081 puDst->au32[2] = uSrc1.au32[3];
15082 puDst->au32[3] = uSrc2.au32[3];
15083}
15084
15085#endif
15086
15087IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
15088{
15089 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
15090 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
15091 ASMCompilerBarrier();
15092 puDst->au32[0] = uSrc1.au32[2];
15093 puDst->au32[1] = uSrc2.au32[2];
15094 puDst->au32[2] = uSrc1.au32[3];
15095 puDst->au32[3] = uSrc2.au32[3];
15096}
15097
15098
15099IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15100{
15101 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15102 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15103 ASMCompilerBarrier();
15104 puDst->au32[0] = uSrc1.au32[2];
15105 puDst->au32[1] = uSrc2.au32[2];
15106 puDst->au32[2] = uSrc1.au32[3];
15107 puDst->au32[3] = uSrc2.au32[3];
15108
15109 puDst->au32[4] = uSrc1.au32[6];
15110 puDst->au32[5] = uSrc2.au32[6];
15111 puDst->au32[6] = uSrc1.au32[7];
15112 puDst->au32[7] = uSrc2.au32[7];
15113}
15114
15115
15116/*
15117 * UNPCKHPD / VUNPCKHPD
15118 */
15119#ifdef IEM_WITHOUT_ASSEMBLY
15120IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15121{
15122 RTUINT128U uSrc1 = *puDst;
15123 RTUINT128U uSrc2 = *puSrc;
15124 ASMCompilerBarrier();
15125 puDst->au64[0] = uSrc1.au64[1];
15126 puDst->au64[1] = uSrc2.au64[1];
15127}
15128
15129#endif
15130
15131IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
15132{
15133 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
15134 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
15135 ASMCompilerBarrier();
15136 puDst->au64[0] = uSrc1.au64[1];
15137 puDst->au64[1] = uSrc2.au64[1];
15138}
15139
15140
15141IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15142{
15143 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15144 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15145 ASMCompilerBarrier();
15146 puDst->au64[0] = uSrc1.au64[1];
15147 puDst->au64[1] = uSrc2.au64[1];
15148 puDst->au64[2] = uSrc1.au64[3];
15149 puDst->au64[3] = uSrc2.au64[3];
15150}
15151
15152
15153/*
15154 * CRC32 (SSE 4.2).
15155 */
15156
15157IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
15158{
15159 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15160}
15161
15162
15163IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
15164{
15165 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15166}
15167
15168IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
15169{
15170 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15171}
15172
15173IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
15174{
15175 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15176}
15177
15178
15179/*
15180 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
15181 */
15182#ifdef IEM_WITHOUT_ASSEMBLY
15183IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
15184{
15185 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15186 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15187 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15188 fEfl |= X86_EFL_ZF;
15189 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15190 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15191 fEfl |= X86_EFL_CF;
15192 *pfEFlags = fEfl;
15193}
15194#endif
15195
15196IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
15197{
15198 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15199 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15200 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
15201 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
15202 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15203 fEfl |= X86_EFL_ZF;
15204 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15205 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
15206 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
15207 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15208 fEfl |= X86_EFL_CF;
15209 *pfEFlags = fEfl;
15210}
15211
15212
15213/*
15214 * PMOVSXBW / VPMOVSXBW
15215 */
15216IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15217{
15218 RTUINT64U uSrc1 = { uSrc };
15219 puDst->ai16[0] = uSrc1.ai8[0];
15220 puDst->ai16[1] = uSrc1.ai8[1];
15221 puDst->ai16[2] = uSrc1.ai8[2];
15222 puDst->ai16[3] = uSrc1.ai8[3];
15223 puDst->ai16[4] = uSrc1.ai8[4];
15224 puDst->ai16[5] = uSrc1.ai8[5];
15225 puDst->ai16[6] = uSrc1.ai8[6];
15226 puDst->ai16[7] = uSrc1.ai8[7];
15227}
15228
15229
15230IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15231{
15232 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15233 puDst->ai16[ 0] = uSrc1.ai8[ 0];
15234 puDst->ai16[ 1] = uSrc1.ai8[ 1];
15235 puDst->ai16[ 2] = uSrc1.ai8[ 2];
15236 puDst->ai16[ 3] = uSrc1.ai8[ 3];
15237 puDst->ai16[ 4] = uSrc1.ai8[ 4];
15238 puDst->ai16[ 5] = uSrc1.ai8[ 5];
15239 puDst->ai16[ 6] = uSrc1.ai8[ 6];
15240 puDst->ai16[ 7] = uSrc1.ai8[ 7];
15241 puDst->ai16[ 8] = uSrc1.ai8[ 8];
15242 puDst->ai16[ 9] = uSrc1.ai8[ 9];
15243 puDst->ai16[10] = uSrc1.ai8[10];
15244 puDst->ai16[11] = uSrc1.ai8[11];
15245 puDst->ai16[12] = uSrc1.ai8[12];
15246 puDst->ai16[13] = uSrc1.ai8[13];
15247 puDst->ai16[14] = uSrc1.ai8[14];
15248 puDst->ai16[15] = uSrc1.ai8[15];
15249}
15250
15251
15252/*
15253 * PMOVSXBD / VPMOVSXBD
15254 */
15255IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15256{
15257 RTUINT32U uSrc1 = { uSrc };
15258 puDst->ai32[0] = uSrc1.ai8[0];
15259 puDst->ai32[1] = uSrc1.ai8[1];
15260 puDst->ai32[2] = uSrc1.ai8[2];
15261 puDst->ai32[3] = uSrc1.ai8[3];
15262}
15263
15264
15265IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15266{
15267 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15268 puDst->ai32[0] = uSrc1.ai8[0];
15269 puDst->ai32[1] = uSrc1.ai8[1];
15270 puDst->ai32[2] = uSrc1.ai8[2];
15271 puDst->ai32[3] = uSrc1.ai8[3];
15272 puDst->ai32[4] = uSrc1.ai8[4];
15273 puDst->ai32[5] = uSrc1.ai8[5];
15274 puDst->ai32[6] = uSrc1.ai8[6];
15275 puDst->ai32[7] = uSrc1.ai8[7];
15276}
15277
15278
15279/*
15280 * PMOVSXBQ / VPMOVSXBQ
15281 */
15282IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15283{
15284 RTUINT16U uSrc1 = { uSrc };
15285 puDst->ai64[0] = uSrc1.ai8[0];
15286 puDst->ai64[1] = uSrc1.ai8[1];
15287}
15288
15289
15290IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15291{
15292 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15293 puDst->ai64[0] = uSrc1.ai8[0];
15294 puDst->ai64[1] = uSrc1.ai8[1];
15295 puDst->ai64[2] = uSrc1.ai8[2];
15296 puDst->ai64[3] = uSrc1.ai8[3];
15297}
15298
15299
15300/*
15301 * PMOVSXWD / VPMOVSXWD
15302 */
15303IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15304{
15305 RTUINT64U uSrc1 = { uSrc };
15306 puDst->ai32[0] = uSrc1.ai16[0];
15307 puDst->ai32[1] = uSrc1.ai16[1];
15308 puDst->ai32[2] = uSrc1.ai16[2];
15309 puDst->ai32[3] = uSrc1.ai16[3];
15310}
15311
15312
15313IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15314{
15315 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15316 puDst->ai32[0] = uSrc1.ai16[0];
15317 puDst->ai32[1] = uSrc1.ai16[1];
15318 puDst->ai32[2] = uSrc1.ai16[2];
15319 puDst->ai32[3] = uSrc1.ai16[3];
15320 puDst->ai32[4] = uSrc1.ai16[4];
15321 puDst->ai32[5] = uSrc1.ai16[5];
15322 puDst->ai32[6] = uSrc1.ai16[6];
15323 puDst->ai32[7] = uSrc1.ai16[7];
15324}
15325
15326
15327/*
15328 * PMOVSXWQ / VPMOVSXWQ
15329 */
15330IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15331{
15332 RTUINT32U uSrc1 = { uSrc };
15333 puDst->ai64[0] = uSrc1.ai16[0];
15334 puDst->ai64[1] = uSrc1.ai16[1];
15335}
15336
15337
15338IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15339{
15340 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15341 puDst->ai64[0] = uSrc1.ai16[0];
15342 puDst->ai64[1] = uSrc1.ai16[1];
15343 puDst->ai64[2] = uSrc1.ai16[2];
15344 puDst->ai64[3] = uSrc1.ai16[3];
15345}
15346
15347
15348/*
15349 * PMOVSXDQ / VPMOVSXDQ
15350 */
15351IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15352{
15353 RTUINT64U uSrc1 = { uSrc };
15354 puDst->ai64[0] = uSrc1.ai32[0];
15355 puDst->ai64[1] = uSrc1.ai32[1];
15356}
15357
15358
15359IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15360{
15361 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15362 puDst->ai64[0] = uSrc1.ai32[0];
15363 puDst->ai64[1] = uSrc1.ai32[1];
15364 puDst->ai64[2] = uSrc1.ai32[2];
15365 puDst->ai64[3] = uSrc1.ai32[3];
15366}
15367
15368
15369/*
15370 * PMOVZXBW / VPMOVZXBW
15371 */
15372IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15373{
15374 RTUINT64U uSrc1 = { uSrc };
15375 puDst->au16[0] = uSrc1.au8[0];
15376 puDst->au16[1] = uSrc1.au8[1];
15377 puDst->au16[2] = uSrc1.au8[2];
15378 puDst->au16[3] = uSrc1.au8[3];
15379 puDst->au16[4] = uSrc1.au8[4];
15380 puDst->au16[5] = uSrc1.au8[5];
15381 puDst->au16[6] = uSrc1.au8[6];
15382 puDst->au16[7] = uSrc1.au8[7];
15383}
15384
15385
15386IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15387{
15388 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15389 puDst->au16[ 0] = uSrc1.au8[ 0];
15390 puDst->au16[ 1] = uSrc1.au8[ 1];
15391 puDst->au16[ 2] = uSrc1.au8[ 2];
15392 puDst->au16[ 3] = uSrc1.au8[ 3];
15393 puDst->au16[ 4] = uSrc1.au8[ 4];
15394 puDst->au16[ 5] = uSrc1.au8[ 5];
15395 puDst->au16[ 6] = uSrc1.au8[ 6];
15396 puDst->au16[ 7] = uSrc1.au8[ 7];
15397 puDst->au16[ 8] = uSrc1.au8[ 8];
15398 puDst->au16[ 9] = uSrc1.au8[ 9];
15399 puDst->au16[10] = uSrc1.au8[10];
15400 puDst->au16[11] = uSrc1.au8[11];
15401 puDst->au16[12] = uSrc1.au8[12];
15402 puDst->au16[13] = uSrc1.au8[13];
15403 puDst->au16[14] = uSrc1.au8[14];
15404 puDst->au16[15] = uSrc1.au8[15];
15405}
15406
15407
15408/*
15409 * PMOVZXBD / VPMOVZXBD
15410 */
15411IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15412{
15413 RTUINT32U uSrc1 = { uSrc };
15414 puDst->au32[0] = uSrc1.au8[0];
15415 puDst->au32[1] = uSrc1.au8[1];
15416 puDst->au32[2] = uSrc1.au8[2];
15417 puDst->au32[3] = uSrc1.au8[3];
15418}
15419
15420
15421IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15422{
15423 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15424 puDst->au32[0] = uSrc1.au8[0];
15425 puDst->au32[1] = uSrc1.au8[1];
15426 puDst->au32[2] = uSrc1.au8[2];
15427 puDst->au32[3] = uSrc1.au8[3];
15428 puDst->au32[4] = uSrc1.au8[4];
15429 puDst->au32[5] = uSrc1.au8[5];
15430 puDst->au32[6] = uSrc1.au8[6];
15431 puDst->au32[7] = uSrc1.au8[7];
15432}
15433
15434
15435/*
15436 * PMOVZXBQ / VPMOVZXBQ
15437 */
15438IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15439{
15440 RTUINT16U uSrc1 = { uSrc };
15441 puDst->au64[0] = uSrc1.au8[0];
15442 puDst->au64[1] = uSrc1.au8[1];
15443}
15444
15445
15446IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15447{
15448 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15449 puDst->au64[0] = uSrc1.au8[0];
15450 puDst->au64[1] = uSrc1.au8[1];
15451 puDst->au64[2] = uSrc1.au8[2];
15452 puDst->au64[3] = uSrc1.au8[3];
15453}
15454
15455
15456/*
15457 * PMOVZXWD / VPMOVZXWD
15458 */
15459IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15460{
15461 RTUINT64U uSrc1 = { uSrc };
15462 puDst->au32[0] = uSrc1.au16[0];
15463 puDst->au32[1] = uSrc1.au16[1];
15464 puDst->au32[2] = uSrc1.au16[2];
15465 puDst->au32[3] = uSrc1.au16[3];
15466}
15467
15468
15469IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15470{
15471 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15472 puDst->au32[0] = uSrc1.au16[0];
15473 puDst->au32[1] = uSrc1.au16[1];
15474 puDst->au32[2] = uSrc1.au16[2];
15475 puDst->au32[3] = uSrc1.au16[3];
15476 puDst->au32[4] = uSrc1.au16[4];
15477 puDst->au32[5] = uSrc1.au16[5];
15478 puDst->au32[6] = uSrc1.au16[6];
15479 puDst->au32[7] = uSrc1.au16[7];
15480}
15481
15482
15483/*
15484 * PMOVZXWQ / VPMOVZXWQ
15485 */
15486IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15487{
15488 RTUINT32U uSrc1 = { uSrc };
15489 puDst->au64[0] = uSrc1.au16[0];
15490 puDst->au64[1] = uSrc1.au16[1];
15491}
15492
15493
15494IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15495{
15496 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15497 puDst->au64[0] = uSrc1.au16[0];
15498 puDst->au64[1] = uSrc1.au16[1];
15499 puDst->au64[2] = uSrc1.au16[2];
15500 puDst->au64[3] = uSrc1.au16[3];
15501}
15502
15503
15504/*
15505 * PMOVZXDQ / VPMOVZXDQ
15506 */
15507IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15508{
15509 RTUINT64U uSrc1 = { uSrc };
15510 puDst->au64[0] = uSrc1.au32[0];
15511 puDst->au64[1] = uSrc1.au32[1];
15512}
15513
15514
15515IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15516{
15517 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15518 puDst->au64[0] = uSrc1.au32[0];
15519 puDst->au64[1] = uSrc1.au32[1];
15520 puDst->au64[2] = uSrc1.au32[2];
15521 puDst->au64[3] = uSrc1.au32[3];
15522}
15523
15524/**
15525 * Converts from the packed IPRT 32-bit (single precision) floating point format to
15526 * the SoftFloat 32-bit floating point format (float32_t).
15527 *
15528 * This is only a structure format conversion, nothing else.
15529 */
15530DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
15531{
15532 float32_t Tmp;
15533 Tmp.v = pr32Val->u;
15534 return Tmp;
15535}
15536
15537
15538/**
15539 * Converts from SoftFloat 32-bit floating point format (float32_t)
15540 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
15541 *
15542 * This is only a structure format conversion, nothing else.
15543 */
15544DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
15545{
15546 pr32Dst->u = r32XSrc.v;
15547 return pr32Dst;
15548}
15549
15550
15551/**
15552 * Converts from the packed IPRT 64-bit (single precision) floating point format to
15553 * the SoftFloat 64-bit floating point format (float64_t).
15554 *
15555 * This is only a structure format conversion, nothing else.
15556 */
15557DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
15558{
15559 float64_t Tmp;
15560 Tmp.v = pr64Val->u;
15561 return Tmp;
15562}
15563
15564
15565/**
15566 * Converts from SoftFloat 64-bit floating point format (float64_t)
15567 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
15568 *
15569 * This is only a structure format conversion, nothing else.
15570 */
15571DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
15572{
15573 pr64Dst->u = r64XSrc.v;
15574 return pr64Dst;
15575}
15576
15577
/**
 * Positional initializer for a SoftFloat state structure (softfloat_state_t)
 * derived from an MXCSR value.
 *
 * The five values are, in order: the tininess detection mode, the rounding
 * mode translated from MXCSR.RC, a zero (presumably the initial exception
 * flags - confirm against the softfloat_state_t declaration), the MXCSR
 * exception mask bits shifted down to bit 0, and the rounding precision.
 *
 * @param   a_Mxcsr     The MXCSR value.  Evaluated more than once, so it must
 *                      be free of side effects.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        softfloat_tininess_afterRounding, \
        /* The RC encodings are mutually exclusive; anything not matched below rounds towards zero. */ \
          ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        :                                                           (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), /* Matches X86_FSW_?E */ \
        32 /* Rounding precision, not relevant for SIMD. */ \
    }
15590
15591#ifdef IEM_WITHOUT_ASSEMBLY
15592
15593/**
15594 * Helper for transfering exception to MXCSR and setting the result value
15595 * accordingly.
15596 *
15597 * @returns Updated MXCSR.
15598 * @param pSoftState The SoftFloat state following the operation.
15599 * @param r32Result The result of the SoftFloat operation.
15600 * @param pr32Result Where to store the result for IEM.
15601 * @param fMxcsr The original MXCSR value.
15602 */
15603DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
15604 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15605{
15606 iemFpSoftF32ToIprt(pr32Result, r32Result);
15607
15608 uint8_t fXcpt = pSoftState->exceptionFlags;
15609 if ( (fMxcsr & X86_MXCSR_FZ)
15610 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
15611 {
15612 /* Underflow masked and flush to zero is set. */
15613 pr32Result->s.uFraction = 0;
15614 pr32Result->s.uExponent = 0;
15615 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15616 }
15617
15618 /* If DAZ is set \#DE is never set. */
15619 if ( fMxcsr & X86_MXCSR_DAZ
15620 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15621 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15622 fXcpt &= ~X86_MXCSR_DE;
15623
15624 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15625}
15626
15627
15628/**
15629 * Helper for transfering exception to MXCSR and setting the result value
15630 * accordingly - ignores Flush-to-Zero.
15631 *
15632 * @returns Updated MXCSR.
15633 * @param pSoftState The SoftFloat state following the operation.
15634 * @param r32Result The result of the SoftFloat operation.
15635 * @param pr32Result Where to store the result for IEM.
15636 * @param fMxcsr The original MXCSR value.
15637 */
15638DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
15639 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15640{
15641 iemFpSoftF32ToIprt(pr32Result, r32Result);
15642
15643 uint8_t fXcpt = pSoftState->exceptionFlags;
15644 /* If DAZ is set \#DE is never set. */
15645 if ( fMxcsr & X86_MXCSR_DAZ
15646 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15647 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15648 fXcpt &= ~X86_MXCSR_DE;
15649
15650 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15651}
15652
15653
15654/**
15655 * Helper for transfering exception to MXCSR and setting the result value
15656 * accordingly.
15657 *
15658 * @returns Updated MXCSR.
15659 * @param pSoftState The SoftFloat state following the operation.
15660 * @param r64Result The result of the SoftFloat operation.
15661 * @param pr64Result Where to store the result for IEM.
15662 * @param fMxcsr The original MXCSR value.
15663 */
15664DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
15665 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15666{
15667 iemFpSoftF64ToIprt(pr64Result, r64Result);
15668 uint8_t fXcpt = pSoftState->exceptionFlags;
15669 if ( (fMxcsr & X86_MXCSR_FZ)
15670 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
15671 {
15672 /* Underflow masked and flush to zero is set. */
15673 iemFpSoftF64ToIprt(pr64Result, r64Result);
15674 pr64Result->s.uFractionHigh = 0;
15675 pr64Result->s.uFractionLow = 0;
15676 pr64Result->s.uExponent = 0;
15677 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15678 }
15679
15680 /* If DAZ is set \#DE is never set. */
15681 if ( fMxcsr & X86_MXCSR_DAZ
15682 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15683 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15684 fXcpt &= ~X86_MXCSR_DE;
15685
15686 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15687}
15688
15689
15690/**
15691 * Helper for transfering exception to MXCSR and setting the result value
15692 * accordingly - ignores Flush-to-Zero.
15693 *
15694 * @returns Updated MXCSR.
15695 * @param pSoftState The SoftFloat state following the operation.
15696 * @param r64Result The result of the SoftFloat operation.
15697 * @param pr64Result Where to store the result for IEM.
15698 * @param fMxcsr The original MXCSR value.
15699 */
15700DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
15701 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15702{
15703 iemFpSoftF64ToIprt(pr64Result, r64Result);
15704
15705 uint8_t fXcpt = pSoftState->exceptionFlags;
15706 /* If DAZ is set \#DE is never set. */
15707 if ( fMxcsr & X86_MXCSR_DAZ
15708 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15709 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15710 fXcpt &= ~X86_MXCSR_DE;
15711
15712 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15713}
15714
15715#endif /* IEM_WITHOUT_ASSEMBLY */
15716
15717
15718/**
15719 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
15720 * in MXCSR into account.
15721 *
15722 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15723 * @param pr32Val Where to store the result.
15724 * @param fMxcsr The input MXCSR value.
15725 * @param pr32Src The value to use.
15726 */
15727DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15728{
15729 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
15730 {
15731 if (fMxcsr & X86_MXCSR_DAZ)
15732 {
15733 /* De-normals are changed to 0. */
15734 pr32Val->s.fSign = pr32Src->s.fSign;
15735 pr32Val->s.uFraction = 0;
15736 pr32Val->s.uExponent = 0;
15737 return 0;
15738 }
15739
15740 *pr32Val = *pr32Src;
15741 return X86_MXCSR_DE;
15742 }
15743
15744 *pr32Val = *pr32Src;
15745 return 0;
15746}
15747
15748
15749/**
15750 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
15751 * in MXCSR into account.
15752 *
15753 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15754 * @param pr64Val Where to store the result.
15755 * @param fMxcsr The input MXCSR value.
15756 * @param pr64Src The value to use.
15757 */
15758DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15759{
15760 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
15761 {
15762 if (fMxcsr & X86_MXCSR_DAZ)
15763 {
15764 /* De-normals are changed to 0. */
15765 pr64Val->s64.fSign = pr64Src->s.fSign;
15766 pr64Val->s64.uFraction = 0;
15767 pr64Val->s64.uExponent = 0;
15768 return 0;
15769 }
15770
15771 *pr64Val = *pr64Src;
15772 return X86_MXCSR_DE;
15773 }
15774
15775 *pr64Val = *pr64Src;
15776 return 0;
15777}
15778
15779#ifdef IEM_WITHOUT_ASSEMBLY
15780
15781/**
15782 * Validates the given input operands returning whether the operation can continue or whether one
15783 * of the source operands contains a NaN value, setting the output accordingly.
15784 *
15785 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15786 * @param pr32Res Where to store the result in case the operation can't continue.
15787 * @param pr32Val1 The first input operand.
15788 * @param pr32Val2 The second input operand.
15789 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15790 */
15791DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
15792{
15793 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
15794 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
15795 if (cSNan + cQNan == 2)
15796 {
15797 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15798 *pr32Res = *pr32Val1;
15799 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15800 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15801 return true;
15802 }
15803 if (cSNan)
15804 {
15805 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15806 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15807 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15808 *pfMxcsr |= X86_MXCSR_IE;
15809 return true;
15810 }
15811 if (cQNan)
15812 {
15813 /* The QNan operand is placed into the result. */
15814 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15815 return true;
15816 }
15817
15818 Assert(!cQNan && !cSNan);
15819 return false;
15820}
15821
15822
15823/**
15824 * Validates the given double precision input operands returning whether the operation can continue or whether one
15825 * of the source operands contains a NaN value, setting the output accordingly.
15826 *
15827 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15828 * @param pr64Res Where to store the result in case the operation can't continue.
15829 * @param pr64Val1 The first input operand.
15830 * @param pr64Val2 The second input operand.
15831 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15832 */
15833DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
15834{
15835 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
15836 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
15837 if (cSNan + cQNan == 2)
15838 {
15839 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15840 *pr64Res = *pr64Val1;
15841 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15842 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15843 return true;
15844 }
15845 if (cSNan)
15846 {
15847 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15848 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15849 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15850 *pfMxcsr |= X86_MXCSR_IE;
15851 return true;
15852 }
15853 if (cQNan)
15854 {
15855 /* The QNan operand is placed into the result. */
15856 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15857 return true;
15858 }
15859
15860 Assert(!cQNan && !cSNan);
15861 return false;
15862}
15863
15864
15865/**
15866 * Validates the given single input operand returning whether the operation can continue or whether
15867 * contains a NaN value, setting the output accordingly.
15868 *
15869 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15870 * @param pr32Res Where to store the result in case the operation can't continue.
15871 * @param pr32Val The input operand.
15872 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15873 */
15874DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15875{
15876 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15877 {
15878 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15879 *pr32Res = *pr32Val;
15880 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15881 *pfMxcsr |= X86_MXCSR_IE;
15882 return true;
15883 }
15884 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15885 {
15886 /* The QNan operand is placed into the result. */
15887 *pr32Res = *pr32Val;
15888 return true;
15889 }
15890
15891 return false;
15892}
15893
15894
15895/**
15896 * Validates the given double input operand returning whether the operation can continue or whether
15897 * contains a NaN value, setting the output accordingly.
15898 *
15899 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15900 * @param pr64Res Where to store the result in case the operation can't continue.
15901 * @param pr64Val The input operand.
15902 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15903 */
15904DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15905{
15906 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15907 {
15908 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15909 *pr64Res = *pr64Val;
15910 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15911 *pfMxcsr |= X86_MXCSR_IE;
15912 return true;
15913 }
15914 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15915 {
15916 /* The QNan operand is placed into the result. */
15917 *pr64Res = *pr64Val;
15918 return true;
15919 }
15920
15921 return false;
15922}
15923
15924#endif /* IEM_WITHOUT_ASSEMBLY */
15925
15926/**
15927 * ADDPS
15928 */
15929#ifdef IEM_WITHOUT_ASSEMBLY
15930static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15931{
15932 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15933 return fMxcsr;
15934
15935 RTFLOAT32U r32Src1, r32Src2;
15936 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15937 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15938 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15939 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15940 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15941}
15942
15943
15944IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15945{
15946 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15947 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15948 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15949 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15950}
15951#endif
15952
15953
15954/**
15955 * ADDSS
15956 */
15957#ifdef IEM_WITHOUT_ASSEMBLY
15958IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15959{
15960 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15961 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15962 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15963 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15964}
15965#endif
15966
15967
15968/**
15969 * ADDPD
15970 */
15971#ifdef IEM_WITHOUT_ASSEMBLY
15972static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15973{
15974 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15975 return fMxcsr;
15976
15977 RTFLOAT64U r64Src1, r64Src2;
15978 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15979 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15980 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15981 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15982 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15983}
15984
15985
15986IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15987{
15988 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15989 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15990}
15991#endif
15992
15993
15994/**
15995 * ADDSD
15996 */
15997#ifdef IEM_WITHOUT_ASSEMBLY
15998IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15999{
16000 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16001 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16002}
16003#endif
16004
16005
16006/**
16007 * MULPS
16008 */
16009#ifdef IEM_WITHOUT_ASSEMBLY
16010static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16011{
16012 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16013 return fMxcsr;
16014
16015 RTFLOAT32U r32Src1, r32Src2;
16016 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16017 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16018 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16019 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16020 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16021}
16022
16023
16024IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16025{
16026 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16027 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16028 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16029 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16030}
16031#endif
16032
16033
16034/**
16035 * MULSS
16036 */
16037#ifdef IEM_WITHOUT_ASSEMBLY
16038IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16039{
16040 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16041 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16042 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16043 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16044}
16045#endif
16046
16047
16048/**
16049 * MULPD
16050 */
16051#ifdef IEM_WITHOUT_ASSEMBLY
16052static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16053{
16054 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16055 return fMxcsr;
16056
16057 RTFLOAT64U r64Src1, r64Src2;
16058 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16059 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16060 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16061 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16062 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16063}
16064
16065
16066IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16067{
16068 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16069 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16070}
16071#endif
16072
16073
16074/**
16075 * MULSD
16076 */
16077#ifdef IEM_WITHOUT_ASSEMBLY
16078IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16079{
16080 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16081 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16082}
16083#endif
16084
16085
16086/**
16087 * SUBPS
16088 */
16089#ifdef IEM_WITHOUT_ASSEMBLY
16090static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16091{
16092 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16093 return fMxcsr;
16094
16095 RTFLOAT32U r32Src1, r32Src2;
16096 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16097 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16098 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16099 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16100 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16101}
16102
16103
16104IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16105{
16106 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16107 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16108 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16109 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16110}
16111#endif
16112
16113
16114/**
16115 * SUBSS
16116 */
16117#ifdef IEM_WITHOUT_ASSEMBLY
16118IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16119{
16120 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16121 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16122 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16123 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16124}
16125#endif
16126
16127
16128/**
16129 * SUBPD
16130 */
16131#ifdef IEM_WITHOUT_ASSEMBLY
16132static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16133{
16134 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16135 return fMxcsr;
16136
16137 RTFLOAT64U r64Src1, r64Src2;
16138 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16139 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16140 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16141 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16142 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16143}
16144
16145
16146IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16147{
16148 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16149 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16150}
16151#endif
16152
16153
16154/**
16155 * SUBSD
16156 */
16157#ifdef IEM_WITHOUT_ASSEMBLY
16158IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16159{
16160 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16161 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16162}
16163#endif
16164
16165
16166/**
16167 * MINPS
16168 */
16169#ifdef IEM_WITHOUT_ASSEMBLY
16170static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16171{
16172 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16173 {
16174 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16175 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16176 return fMxcsr | X86_MXCSR_IE;
16177 }
16178
16179 RTFLOAT32U r32Src1, r32Src2;
16180 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16181 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16182 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16183 {
16184 *pr32Res = r32Src2;
16185 return fMxcsr;
16186 }
16187
16188 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16189 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16190 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16191 fLe
16192 ? iemFpSoftF32FromIprt(&r32Src1)
16193 : iemFpSoftF32FromIprt(&r32Src2),
16194 pr32Res, fMxcsr);
16195}
16196
16197
16198IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16199{
16200 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16201 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16202 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16203 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16204}
16205#endif
16206
16207
16208/**
16209 * MINSS
16210 */
16211#ifdef IEM_WITHOUT_ASSEMBLY
16212IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16213{
16214 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16215 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16216 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16217 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16218}
16219#endif
16220
16221
16222/**
16223 * MINPD
16224 */
16225#ifdef IEM_WITHOUT_ASSEMBLY
16226static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16227{
16228 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16229 {
16230 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16231 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16232 return fMxcsr | X86_MXCSR_IE;
16233 }
16234
16235 RTFLOAT64U r64Src1, r64Src2;
16236 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16237 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16238 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16239 {
16240 *pr64Res = r64Src2;
16241 return fMxcsr;
16242 }
16243
16244 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16245 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16246 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16247 fLe
16248 ? iemFpSoftF64FromIprt(&r64Src1)
16249 : iemFpSoftF64FromIprt(&r64Src2),
16250 pr64Res, fMxcsr);
16251}
16252
16253
16254IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16255{
16256 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16257 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16258}
16259#endif
16260
16261
16262/**
16263 * MINSD
16264 */
16265#ifdef IEM_WITHOUT_ASSEMBLY
16266IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16267{
16268 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16269 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16270}
16271#endif
16272
16273
16274/**
16275 * DIVPS
16276 */
16277#ifdef IEM_WITHOUT_ASSEMBLY
16278static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16279{
16280 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16281 return fMxcsr;
16282
16283 RTFLOAT32U r32Src1, r32Src2;
16284 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16285 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16286 if (RTFLOAT32U_IS_ZERO(&r32Src2))
16287 {
16288 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
16289 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
16290 {
16291 *pr32Res = g_ar32QNaN[1];
16292 return fMxcsr | X86_MXCSR_IE;
16293 }
16294 else if (RTFLOAT32U_IS_INF(&r32Src1))
16295 {
16296 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16297 return fMxcsr;
16298 }
16299 else
16300 {
16301 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16302 return fMxcsr | X86_MXCSR_ZE;
16303 }
16304 }
16305
16306 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16307 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16308 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16309}
16310
16311
16312IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16313{
16314 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16315 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16316 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16317 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16318}
16319#endif
16320
16321
16322/**
16323 * DIVSS
16324 */
16325#ifdef IEM_WITHOUT_ASSEMBLY
16326IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16327{
16328 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16329 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16330 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16331 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16332}
16333#endif
16334
16335
16336/**
16337 * DIVPD
16338 */
16339#ifdef IEM_WITHOUT_ASSEMBLY
16340static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16341{
16342 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16343 return fMxcsr;
16344
16345 RTFLOAT64U r64Src1, r64Src2;
16346 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16347 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16348 if (RTFLOAT64U_IS_ZERO(&r64Src2))
16349 {
16350 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
16351 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
16352 {
16353 *pr64Res = g_ar64QNaN[1];
16354 return fMxcsr | X86_MXCSR_IE;
16355 }
16356 else if (RTFLOAT64U_IS_INF(&r64Src1))
16357 {
16358 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16359 return fMxcsr;
16360 }
16361 else
16362 {
16363 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16364 return fMxcsr | X86_MXCSR_ZE;
16365 }
16366 }
16367
16368 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16369 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16370 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16371}
16372
16373
16374IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16375{
16376 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16377 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16378}
16379#endif
16380
16381
16382/**
16383 * DIVSD
16384 */
16385#ifdef IEM_WITHOUT_ASSEMBLY
16386IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16387{
16388 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16389 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16390}
16391#endif
16392
16393
16394/**
16395 * MAXPS
16396 */
16397#ifdef IEM_WITHOUT_ASSEMBLY
16398static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16399{
16400 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16401 {
16402 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16403 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16404 return fMxcsr | X86_MXCSR_IE;
16405 }
16406
16407 RTFLOAT32U r32Src1, r32Src2;
16408 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16409 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16410 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16411 {
16412 *pr32Res = r32Src2;
16413 return fMxcsr;
16414 }
16415
16416 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16417 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16418 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16419 fLe
16420 ? iemFpSoftF32FromIprt(&r32Src2)
16421 : iemFpSoftF32FromIprt(&r32Src1),
16422 pr32Res, fMxcsr);
16423}
16424
16425
16426IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16427{
16428 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16429 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16430 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16431 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16432}
16433#endif
16434
16435
16436/**
16437 * MAXSS
16438 */
16439#ifdef IEM_WITHOUT_ASSEMBLY
16440IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16441{
16442 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16443 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16444 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16445 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16446}
16447#endif
16448
16449
16450/**
16451 * MAXPD
16452 */
16453#ifdef IEM_WITHOUT_ASSEMBLY
16454static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16455{
16456 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16457 {
16458 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16459 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16460 return fMxcsr | X86_MXCSR_IE;
16461 }
16462
16463 RTFLOAT64U r64Src1, r64Src2;
16464 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16465 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16466 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16467 {
16468 *pr64Res = r64Src2;
16469 return fMxcsr;
16470 }
16471
16472 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16473 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16474 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16475 fLe
16476 ? iemFpSoftF64FromIprt(&r64Src2)
16477 : iemFpSoftF64FromIprt(&r64Src1),
16478 pr64Res, fMxcsr);
16479}
16480
16481
16482IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16483{
16484 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16485 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16486}
16487#endif
16488
16489
16490/**
16491 * MAXSD
16492 */
16493#ifdef IEM_WITHOUT_ASSEMBLY
16494IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16495{
16496 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16497 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16498}
16499#endif
16500
16501
16502/**
16503 * CVTSS2SD
16504 */
16505#ifdef IEM_WITHOUT_ASSEMBLY
16506static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16507{
16508 RTFLOAT32U r32Src1;
16509 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16510
16511 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16512 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16513 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16514}
16515
16516
16517IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16518{
16519 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
16520 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16521}
16522#endif
16523
16524
16525/**
16526 * CVTSD2SS
16527 */
16528#ifdef IEM_WITHOUT_ASSEMBLY
16529static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16530{
16531 RTFLOAT64U r64Src1;
16532 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16533
16534 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16535 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16536 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16537}
16538
16539
16540IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16541{
16542 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
16543 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16544 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16545 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16546}
16547#endif
16548
16549
16550/**
16551 * HADDPS
16552 */
16553#ifdef IEM_WITHOUT_ASSEMBLY
16554IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16555{
16556 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
16557 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
16558 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
16559 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16560}
16561#endif
16562
16563
16564/**
16565 * HADDPD
16566 */
16567#ifdef IEM_WITHOUT_ASSEMBLY
16568IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16569{
16570 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
16571 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16572}
16573#endif
16574
16575
16576/**
16577 * HSUBPS
16578 */
16579#ifdef IEM_WITHOUT_ASSEMBLY
16580IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16581{
16582 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
16583 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
16584 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
16585 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16586}
16587#endif
16588
16589
16590/**
16591 * HSUBPD
16592 */
16593#ifdef IEM_WITHOUT_ASSEMBLY
16594IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16595{
16596 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
16597 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16598}
16599#endif
16600
16601
16602/**
16603 * SQRTPS
16604 */
16605#ifdef IEM_WITHOUT_ASSEMBLY
16606static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16607{
16608 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16609 return fMxcsr;
16610
16611 RTFLOAT32U r32Src;
16612 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
16613 if (RTFLOAT32U_IS_ZERO(&r32Src))
16614 {
16615 *pr32Res = r32Src;
16616 return fMxcsr;
16617 }
16618 else if (r32Src.s.fSign)
16619 {
16620 *pr32Res = g_ar32QNaN[1];
16621 return fMxcsr | X86_MXCSR_IE;
16622 }
16623
16624 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16625 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16626 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16627}
16628
16629
16630IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16631{
16632 RT_NOREF(puSrc1);
16633
16634 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16635 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16636 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16637 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16638}
16639#endif
16640
16641
16642/**
16643 * SQRTSS
16644 */
16645#ifdef IEM_WITHOUT_ASSEMBLY
16646IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16647{
16648 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16649 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16650 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16651 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16652}
16653#endif
16654
16655
16656/**
16657 * SQRTPD
16658 */
16659#ifdef IEM_WITHOUT_ASSEMBLY
16660static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
16661{
16662 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
16663 return fMxcsr;
16664
16665 RTFLOAT64U r64Src;
16666 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
16667 if (RTFLOAT64U_IS_ZERO(&r64Src))
16668 {
16669 *pr64Res = r64Src;
16670 return fMxcsr;
16671 }
16672 else if (r64Src.s.fSign)
16673 {
16674 *pr64Res = g_ar64QNaN[1];
16675 return fMxcsr | X86_MXCSR_IE;
16676 }
16677
16678 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16679 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
16680 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16681}
16682
16683
16684IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16685{
16686 RT_NOREF(puSrc1);
16687
16688 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16689 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16690}
16691#endif
16692
16693
16694/**
16695 * SQRTSD
16696 */
16697#ifdef IEM_WITHOUT_ASSEMBLY
16698IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16699{
16700 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
16701 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16702}
16703#endif
16704
16705
16706#ifdef IEM_WITHOUT_ASSEMBLY
16707/**
16708 * RSQRTPS
16709 */
16710static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16711{
16712 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16713 return fMxcsr;
16714
16715 RTFLOAT32U r32Src;
16716 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16717 if (RTFLOAT32U_IS_ZERO(&r32Src))
16718 {
16719 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16720 return fMxcsr;
16721 }
16722 else if (r32Src.s.fSign)
16723 {
16724 *pr32Res = g_ar32QNaN[1];
16725 return fMxcsr | X86_MXCSR_IE;
16726 }
16727
16728 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16729 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16730 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16731}
16732
16733
16734IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16735{
16736 RT_NOREF(puSrc1);
16737
16738 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16739 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16740 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16741 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16742}
16743
16744
16745/**
16746 * RSQRTSS
16747 */
16748IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16749{
16750 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16751 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16752 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16753 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16754}
16755#endif
16756
16757
16758/**
16759 * RCPPS
16760 */
16761#ifdef IEM_WITHOUT_ASSEMBLY
16762static uint32_t iemAImpl_rcp_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16763{
16764 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16765 return fMxcsr;
16766
16767 RTFLOAT32U r32Src;
16768 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16769 if (RTFLOAT32U_IS_ZERO(&r32Src))
16770 {
16771 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16772 return fMxcsr;
16773 }
16774
16775 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16776 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&g_ar32One[0]), iemFpSoftF32FromIprt(&r32Src), &SoftState);
16777 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16778}
16779
16780
16781IEM_DECL_IMPL_DEF(void, iemAImpl_rcpps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16782{
16783 RT_NOREF(puSrc1);
16784
16785 pResult->MXCSR = iemAImpl_rcp_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16786 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16787 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16788 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16789}
16790
16791
16792/**
16793 * RCPSS
16794 */
16795IEM_DECL_IMPL_DEF(void, iemAImpl_rcpss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16796{
16797 pResult->MXCSR = iemAImpl_rcp_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16798 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16799 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16800 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16801}
16802#endif
16803
16804
16805/**
16806 * ADDSUBPS
16807 */
16808#ifdef IEM_WITHOUT_ASSEMBLY
16809IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16810{
16811 RT_NOREF(puSrc1);
16812
16813 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16814 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16815 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16816 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16817}
16818#endif
16819
16820
16821/**
16822 * ADDSUBPD
16823 */
16824#ifdef IEM_WITHOUT_ASSEMBLY
16825IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16826{
16827 RT_NOREF(puSrc1);
16828
16829 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16830 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16831}
16832#endif
16833
16834
16835/**
16836 * CVTPD2PS
16837 */
16838#ifdef IEM_WITHOUT_ASSEMBLY
16839static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16840{
16841 RTFLOAT64U r64Src1;
16842 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16843
16844 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16845 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16846 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16847}
16848
16849
16850IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16851{
16852 RT_NOREF(puSrc1);
16853
16854 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16855 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16856 pResult->uResult.au32[2] = 0;
16857 pResult->uResult.au32[3] = 0;
16858}
16859#endif
16860
16861
16862/**
16863 * CVTPS2PD
16864 */
16865#ifdef IEM_WITHOUT_ASSEMBLY
16866static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16867{
16868 RTFLOAT32U r32Src1;
16869 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16870
16871 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16872 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16873 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16874}
16875
16876
16877IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16878{
16879 RT_NOREF(puSrc1);
16880
16881 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16882 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16883}
16884#endif
16885
16886
16887/**
16888 * CVTDQ2PS
16889 */
16890#ifdef IEM_WITHOUT_ASSEMBLY
16891static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
16892{
16893 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16894 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
16895 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16896}
16897
16898
16899IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16900{
16901 RT_NOREF(puSrc1);
16902
16903 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16904 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16905 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
16906 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
16907}
16908#endif
16909
16910
16911/**
16912 * CVTPS2DQ
16913 */
16914#ifdef IEM_WITHOUT_ASSEMBLY
16915static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16916{
16917 RTFLOAT32U r32Src;
16918 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16919
16920 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16921 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16922 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16923}
16924
16925
16926IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16927{
16928 RT_NOREF(puSrc1);
16929
16930 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16931 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16932 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16933 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16934}
16935#endif
16936
16937
16938/**
16939 * CVTTPS2DQ
16940 */
16941#ifdef IEM_WITHOUT_ASSEMBLY
16942static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16943{
16944 RTFLOAT32U r32Src;
16945 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16946
16947 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16948 SoftState.roundingMode = softfloat_round_minMag;
16949 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16950 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16951}
16952
16953
16954IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16955{
16956 RT_NOREF(puSrc1);
16957
16958 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16959 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16960 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16961 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16962}
16963#endif
16964
16965
16966/**
16967 * CVTTPD2DQ
16968 */
16969#ifdef IEM_WITHOUT_ASSEMBLY
16970static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16971{
16972 RTFLOAT64U r64Src;
16973 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16974
16975 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16976 SoftState.roundingMode = softfloat_round_minMag;
16977 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16978 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16979}
16980
16981
16982IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16983{
16984 RT_NOREF(puSrc1);
16985
16986 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16987 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16988 pResult->uResult.au64[1] = 0;
16989}
16990#endif
16991
16992
16993/**
16994 * CVTDQ2PD
16995 */
16996#ifdef IEM_WITHOUT_ASSEMBLY
16997static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16998{
16999 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17000 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
17001 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
17002}
17003
17004
17005IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17006{
17007 RT_NOREF(puSrc1);
17008
17009 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
17010 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
17011}
17012#endif
17013
17014
17015/**
17016 * CVTPD2DQ
17017 */
17018#ifdef IEM_WITHOUT_ASSEMBLY
17019static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
17020{
17021 RTFLOAT64U r64Src;
17022 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
17023
17024 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17025 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17026 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17027}
17028
17029
17030IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17031{
17032 RT_NOREF(puSrc1);
17033
17034 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
17035 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
17036 pResult->uResult.au64[1] = 0;
17037}
17038#endif
17039
17040
17041/**
17042 * [V]SHUFPS
17043 */
17044#ifdef IEM_WITHOUT_ASSEMBLY
17045IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17046{
17047 RTUINT128U const uSrc1 = *puDst;
17048 RTUINT128U const uSrc2 = *puSrc;
17049 ASMCompilerBarrier();
17050 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17051 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17052 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17053 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17054}
17055#endif
17056
17057
17058IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17059{
17060 RTUINT128U const uSrc1 = *puSrc1;
17061 RTUINT128U const uSrc2 = *puSrc2;
17062 ASMCompilerBarrier();
17063 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17064 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17065 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17066 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17067}
17068
17069
17070IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17071{
17072 RTUINT256U const uSrc1 = *puSrc1;
17073 RTUINT256U const uSrc2 = *puSrc2;
17074 ASMCompilerBarrier();
17075 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17076 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17077 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17078 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17079
17080 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
17081 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
17082 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
17083 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
17084}
17085
17086
17087/**
17088 * [V]SHUFPD
17089 */
17090#ifdef IEM_WITHOUT_ASSEMBLY
17091IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17092{
17093 RTUINT128U const uSrc1 = *puDst;
17094 RTUINT128U const uSrc2 = *puSrc;
17095 ASMCompilerBarrier();
17096 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17097 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17098}
17099#endif
17100
17101
17102IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17103{
17104 RTUINT128U const uSrc1 = *puSrc1;
17105 RTUINT128U const uSrc2 = *puSrc2;
17106 ASMCompilerBarrier();
17107 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17108 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17109}
17110
17111
17112IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17113{
17114 RTUINT256U const uSrc1 = *puSrc1;
17115 RTUINT256U const uSrc2 = *puSrc2;
17116 ASMCompilerBarrier();
17117 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17118 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17119 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
17120 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
17121}
17122
17123
17124/*
17125 * PHMINPOSUW / VPHMINPOSUW
17126 */
17127IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17128{
17129 uint16_t u16Min = puSrc->au16[0];
17130 uint8_t idxMin = 0;
17131
17132 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
17133 if (puSrc->au16[i] < u16Min)
17134 {
17135 u16Min = puSrc->au16[i];
17136 idxMin = i;
17137 }
17138
17139 puDst->au64[0] = 0;
17140 puDst->au64[1] = 0;
17141 puDst->au16[0] = u16Min;
17142 puDst->au16[1] = idxMin;
17143}
17144
17145
17146IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17147{
17148 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
17149}
17150
17151
17152/**
17153 * VPERMILPS
17154 */
17155#ifdef IEM_WITHOUT_ASSEMBLY
17156IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17157{
17158 RTUINT128U const uSrc = *puSrc;
17159 ASMCompilerBarrier();
17160
17161 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17162 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17163 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17164 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17165}
17166
17167
17168IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17169{
17170 RTUINT256U const uSrc = *puSrc;
17171 ASMCompilerBarrier();
17172
17173 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17174 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17175 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17176 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17177
17178 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17179 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17180 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17181 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17182}
17183
17184IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17185{
17186 RTUINT128U const uSrc1 = *puSrc1;
17187 RTUINT128U const uSrc2 = *puSrc2;
17188 ASMCompilerBarrier();
17189
17190 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17191 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17192 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17193 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17194}
17195
17196IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17197{
17198 RTUINT256U const uSrc1 = *puSrc1;
17199 RTUINT256U const uSrc2 = *puSrc2;
17200 ASMCompilerBarrier();
17201
17202 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17203 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17204 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17205 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17206
17207 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17208 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17209 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17210 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17211}
17212#endif
17213
17214
17215IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17216{
17217 RTUINT128U const uSrc = *puSrc;
17218 ASMCompilerBarrier();
17219
17220 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17221 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17222 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17223 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17224}
17225
17226
17227IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17228{
17229 RTUINT256U const uSrc = *puSrc;
17230 ASMCompilerBarrier();
17231
17232 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17233 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17234 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17235 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17236
17237 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17238 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17239 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17240 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17241}
17242
17243IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17244{
17245 RTUINT128U const uSrc1 = *puSrc1;
17246 RTUINT128U const uSrc2 = *puSrc2;
17247 ASMCompilerBarrier();
17248
17249 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17250 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17251 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17252 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17253}
17254
17255IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17256{
17257 RTUINT256U const uSrc1 = *puSrc1;
17258 RTUINT256U const uSrc2 = *puSrc2;
17259 ASMCompilerBarrier();
17260
17261 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17262 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17263 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17264 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17265
17266 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17267 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17268 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17269 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17270}
17271
17272
17273/**
17274 * VPERMILPD
17275 */
17276#ifdef IEM_WITHOUT_ASSEMBLY
17277IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17278{
17279 RTUINT128U const uSrc = *puSrc;
17280 ASMCompilerBarrier();
17281
17282 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17283 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17284}
17285
17286
17287IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17288{
17289 RTUINT256U const uSrc = *puSrc;
17290 ASMCompilerBarrier();
17291
17292 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17293 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17294
17295 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17296 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17297}
17298
17299IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17300{
17301 RTUINT128U const uSrc1 = *puSrc1;
17302 RTUINT128U const uSrc2 = *puSrc2;
17303 ASMCompilerBarrier();
17304
17305 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17306 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17307}
17308
17309IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17310{
17311 RTUINT256U const uSrc1 = *puSrc1;
17312 RTUINT256U const uSrc2 = *puSrc2;
17313 ASMCompilerBarrier();
17314
17315 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17316 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17317
17318 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17319 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17320}
17321#endif
17322
17323
17324IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17325{
17326 RTUINT128U const uSrc = *puSrc;
17327 ASMCompilerBarrier();
17328
17329 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17330 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17331}
17332
17333
17334IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17335{
17336 RTUINT256U const uSrc = *puSrc;
17337 ASMCompilerBarrier();
17338
17339 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17340 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17341
17342 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17343 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17344}
17345
17346IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17347{
17348 RTUINT128U const uSrc1 = *puSrc1;
17349 RTUINT128U const uSrc2 = *puSrc2;
17350 ASMCompilerBarrier();
17351
17352 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17353 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17354}
17355
17356IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17357{
17358 RTUINT256U const uSrc1 = *puSrc1;
17359 RTUINT256U const uSrc2 = *puSrc2;
17360 ASMCompilerBarrier();
17361
17362 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17363 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17364
17365 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17366 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17367}
17368
17369
17370/*
17371 * [V]PBLENDVB
17372 */
17373IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17374{
17375 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17376 if (puMask->au8[i] & RT_BIT(7))
17377 puDst->au8[i] = puSrc->au8[i];
17378}
17379
17380
17381IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17382{
17383 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17384 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17385}
17386
17387
17388IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17389{
17390 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17391 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17392}
17393
17394
17395/*
17396 * [V]BLENDVPS
17397 */
17398IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17399{
17400 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17401 if (puMask->au32[i] & RT_BIT_32(31))
17402 puDst->au32[i] = puSrc->au32[i];
17403}
17404
17405
17406IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17407{
17408 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17409 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17410}
17411
17412
17413IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17414{
17415 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17416 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17417}
17418
17419
17420/*
17421 * [V]BLENDVPD
17422 */
17423IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17424{
17425 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
17426 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
17427}
17428
17429
17430IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17431{
17432 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17433 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17434}
17435
17436
17437IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17438{
17439 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17440 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17441}
17442
17443
17444/**
17445 * [V]PALIGNR
17446 */
17447IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
17448{
17449 uint64_t const u64Src1 = *pu64Dst;
17450 ASMCompilerBarrier();
17451
17452 if (bEvil >= 16)
17453 *pu64Dst = 0;
17454 else if (bEvil >= 8)
17455 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
17456 else
17457 {
17458 uint8_t cShift = bEvil * 8;
17459 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
17460 | (u64Src2 >> cShift);
17461 }
17462}
17463
17464
17465IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17466{
17467 RTUINT128U const uSrc1 = *puDst;
17468 RTUINT128U const uSrc2 = *puSrc;
17469 ASMCompilerBarrier();
17470
17471 puDst->au64[0] = 0;
17472 puDst->au64[1] = 0;
17473 if (bEvil >= 32)
17474 { /* Everything stays 0. */ }
17475 else if (bEvil >= 16)
17476 {
17477 bEvil -= 16;
17478 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17479 puDst->au8[i - bEvil] = uSrc1.au8[i];
17480 }
17481 else
17482 {
17483 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17484 puDst->au8[i] = uSrc2.au8[i + bEvil];
17485 for (uint8_t i = 0; i < bEvil; i++)
17486 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17487 }
17488}
17489
17490
17491IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17492{
17493 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17494 RTUINT128U const uSrc2 = *puSrc2;
17495 ASMCompilerBarrier();
17496
17497 puDst->au64[0] = 0;
17498 puDst->au64[1] = 0;
17499 if (bEvil >= 32)
17500 { /* Everything stays 0. */ }
17501 else if (bEvil >= 16)
17502 {
17503 bEvil -= 16;
17504 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17505 puDst->au8[i - bEvil] = uSrc1.au8[i];
17506 }
17507 else
17508 {
17509 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17510 puDst->au8[i] = uSrc2.au8[i + bEvil];
17511 for (uint8_t i = 0; i < bEvil; i++)
17512 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17513 }
17514}
17515
17516
17517IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17518{
17519 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17520 RTUINT256U const uSrc2 = *puSrc2;
17521 ASMCompilerBarrier();
17522
17523 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
17524 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
17525}
17526
17527
17528/**
17529 * [V]PBLENDW
17530 */
17531IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17532{
17533 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17534 if (bEvil & RT_BIT(i))
17535 puDst->au16[i] = puSrc->au16[i];
17536}
17537
17538
17539IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17540{
17541 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17542 if (bEvil & RT_BIT(i))
17543 puDst->au16[i] = puSrc2->au16[i];
17544 else
17545 puDst->au16[i] = puSrc1->au16[i];
17546}
17547
17548
17549IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17550{
17551 for (uint8_t i = 0; i < 8; i++)
17552 if (bEvil & RT_BIT(i))
17553 {
17554 puDst->au16[ i] = puSrc2->au16[ i];
17555 puDst->au16[8 + i] = puSrc2->au16[8 + i];
17556 }
17557 else
17558 {
17559 puDst->au16[ i] = puSrc1->au16[ i];
17560 puDst->au16[8 + i] = puSrc1->au16[8 + i];
17561 }
17562}
17563
17564
17565/**
17566 * [V]PBLENDD
17567 */
17568IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17569{
17570 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17571 if (bEvil & RT_BIT(i))
17572 puDst->au32[i] = puSrc2->au32[i];
17573 else
17574 puDst->au32[i] = puSrc1->au32[i];
17575}
17576
17577
17578IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17579{
17580 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17581 if (bEvil & RT_BIT(i))
17582 puDst->au32[i] = puSrc2->au32[i];
17583 else
17584 puDst->au32[i] = puSrc1->au32[i];
17585}
17586
17587
17588/**
17589 * [V]BLENDPS
17590 */
17591IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17592{
17593 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17594 if (bEvil & RT_BIT(i))
17595 puDst->au32[i] = puSrc->au32[i];
17596}
17597
17598
17599IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17600{
17601 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17602 if (bEvil & RT_BIT(i))
17603 puDst->au32[i] = puSrc2->au32[i];
17604 else
17605 puDst->au32[i] = puSrc1->au32[i];
17606}
17607
17608
17609IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17610{
17611 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17612 if (bEvil & RT_BIT(i))
17613 puDst->au32[i] = puSrc2->au32[i];
17614 else
17615 puDst->au32[i] = puSrc1->au32[i];
17616}
17617
17618
17619/**
17620 * [V]BLENDPD
17621 */
17622IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17623{
17624 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17625 if (bEvil & RT_BIT(i))
17626 puDst->au64[i] = puSrc->au64[i];
17627}
17628
17629
17630IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17631{
17632 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17633 if (bEvil & RT_BIT(i))
17634 puDst->au64[i] = puSrc2->au64[i];
17635 else
17636 puDst->au64[i] = puSrc1->au64[i];
17637}
17638
17639
17640IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17641{
17642 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17643 if (bEvil & RT_BIT(i))
17644 puDst->au64[i] = puSrc2->au64[i];
17645 else
17646 puDst->au64[i] = puSrc1->au64[i];
17647}
17648
17649
17650/**
17651 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
17652 */
17653
/* The S-Box lookup table, indexed by the input byte (one row per high nibble). */
static uint8_t iemAImpl_aes_sbox[256] = {
    /* 0x00 */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x10 */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x20 */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x30 */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x40 */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x50 */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x60 */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x70 */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x80 */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x90 */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xa0 */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xb0 */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xc0 */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xd0 */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xe0 */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xf0 */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
17672
17673/* The InvS-Box lookup table. */
static uint8_t iemAImpl_aes_inv_sbox[256] = {
    /* 0x00 */ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    /* 0x10 */ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    /* 0x20 */ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    /* 0x30 */ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    /* 0x40 */ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    /* 0x50 */ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    /* 0x60 */ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    /* 0x70 */ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    /* 0x80 */ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    /* 0x90 */ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    /* 0xa0 */ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    /* 0xb0 */ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    /* 0xc0 */ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    /* 0xd0 */ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    /* 0xe0 */ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    /* 0xf0 */ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
17692
17693/* The ShiftRows lookup table. */
static uint8_t iemAImpl_aes_shift_rows_tbl[16] = {
     0,  5, 10, 15, /* result bytes  0..3  */
     4,  9, 14,  3, /* result bytes  4..7  */
     8, 13,  2,  7, /* result bytes  8..11 */
    12,  1,  6, 11  /* result bytes 12..15 */
};
17697
17698/* The InvShiftRows lookup table. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[16] = {
     0, 13, 10,  7, /* result bytes  0..3  */
     4,  1, 14, 11, /* result bytes  4..7  */
     8,  5,  2, 15, /* result bytes  8..11 */
    12,  9,  6,  3  /* result bytes 12..15 */
};
17702
17703static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
17704{
17705 RTUINT128U uVal;
17706 int i;
17707
17708 for (i = 0; i < 16; ++i)
17709 uVal.au8[i] = abSubst[puSrc->au8[i]];
17710
17711 return uVal;
17712}
17713
/*
 * Shifts the byte left by one and, when bit 7 of the input was set, XORs the
 * result with 0x1b (27); only the low eight bits are returned.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t const bDoubled = (uint8_t)(u << 1);
    if (u & 0x80)
        return (uint8_t)(bDoubled ^ 0x1b);
    return bDoubled;
}
17718
17719static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
17720{
17721 RTUINT128U uVal;
17722 int i;
17723 uint8_t tmp;
17724
17725 for (i = 0; i < 16; i += 4) {
17726 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
17727 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
17728 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
17729 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
17730 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
17731 }
17732
17733 return uVal;
17734}
17735
17736static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
17737{
17738 RTUINT128U uVal;
17739 int i;
17740
17741 for (i = 0; i < 16; ++i)
17742 uVal.au8[i] = puSrc->au8[abShift[i]];
17743
17744 return uVal;
17745}
17746
17747static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
17748{
17749 uint8_t val;
17750
17751 val = ((b >> 0) & 1) * a;
17752 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
17753 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
17754 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
17755 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
17756
17757 return val;
17758}
17759
17760static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
17761{
17762 RTUINT128U uVal;
17763 int i;
17764
17765 for (i = 0; i < 16; i += 4) {
17766 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
17767 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
17768 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
17769 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
17770 }
17771
17772 return uVal;
17773}
17774
17775static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
17776{
17777 RTUINT32U uTmp;
17778
17779 uTmp.au32[0] = w;
17780 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
17781 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
17782 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
17783 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
17784
17785 return uTmp.au32[0];
17786}
17787
/*
 * Rotates the 32-bit value right by eight bits: the lowest byte moves to the
 * top and the other three bytes move down by one byte position.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uLowByte = w & 0xff;
    uint32_t const uRest    = w >> 8;
    return uRest | (uLowByte << 24);
}
17792
17793/**
17794 * [V]AESKEYGENASSIST
17795 */
17796IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
17797{
17798 RTUINT128U uTmp;
17799 uint32_t uRCon = bImm; /* Round constant. */
17800
17801 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
17802 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
17803 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
17804 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
17805
17806 *puDst = uTmp;
17807}
17808
17809
17810/**
17811 * [V]AESIMC
17812 */
17813IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17814{
17815 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
17816}
17817
17818
17819/**
17820 * [V]AESENC
17821 */
17822IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17823{
17824 RTUINT128U uTmp;
17825
17826 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17827 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17828 uTmp = iemAImpl_aes_mix_col(&uTmp);
17829 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17830 uTmp.au64[1] ^= puSrc->au64[1];
17831
17832 *puDst = uTmp;
17833}
17834
17835
17836/**
17837 * [V]AESENCLAST
17838 */
17839IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17840{
17841 RTUINT128U uTmp;
17842
17843 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17844 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17845 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17846 uTmp.au64[1] ^= puSrc->au64[1];
17847
17848 *puDst = uTmp;
17849}
17850
17851
17852/**
17853 * [V]AESDEC
17854 */
17855IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17856{
17857 RTUINT128U uTmp;
17858
17859 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17860 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17861 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
17862 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17863 uTmp.au64[1] ^= puSrc->au64[1];
17864
17865 *puDst = uTmp;
17866}
17867
17868
17869/**
17870 * [V]AESDECLAST
17871 */
17872IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17873{
17874 RTUINT128U uTmp;
17875
17876 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17877 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17878 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17879 uTmp.au64[1] ^= puSrc->au64[1];
17880
17881 *puDst = uTmp;
17882}
17883
17884
17885/**
17886 * [V]PCMPISTRI
17887 */
17888
17889/**
17890 * Does the comparisons based on the mode and source input format.
17891 */
17892static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
17893{
17894#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
17895 do \
17896 { \
17897 for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
17898 for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
17899 { \
17900 switch (a_bAggOp) \
17901 { \
17902 case 0: \
17903 case 2: \
17904 case 3: \
17905 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17906 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17907 break; \
17908 case 1: \
17909 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17910 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17911 break; \
17912 default: \
17913 AssertReleaseFailed(); \
17914 } \
17915 } \
17916 } while(0)
17917
17918 uint8_t bAggOp = (bImm >> 2) & 0x3;
17919 switch (bImm & 0x3)
17920 {
17921 case 0:
17922 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
17923 break;
17924 case 1:
17925 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
17926 break;
17927 case 2:
17928 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
17929 break;
17930 case 3:
17931 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
17932 break;
17933 default:
17934 AssertReleaseFailed();
17935 }
17936#undef PCMPXSTRX_CMP_CASE
17937}
17938
17939static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
17940{
17941 if (bImm & 0x1)
17942 {
17943 /* Words -> 8 elements. */
17944 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
17945 if (puSrc->au16[i] == 0)
17946 return i;
17947
17948 return 8;
17949 }
17950 else
17951 {
17952 /* Bytes -> 16 elements. */
17953 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
17954 if (puSrc->au8[i] == 0)
17955 return i;
17956
17957 return 16;
17958 }
17959}
17960
/**
 * Converts an explicitly given string length into an element count.
 *
 * @returns The absolute value of @a i64Len, saturated to the element count
 *          (8 for words, 16 for bytes).
 * @param   i64Len  The signed length.
 * @param   bImm    The immediate; bit 0 set means word elements, clear means
 *                  byte elements.
 */
static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
{
    int64_t const cMaxElems = (bImm & 0x1) ? 8 : 16;

    /* Anything at or beyond the element count in either direction saturates. */
    if (i64Len <= -cMaxElems || i64Len >= cMaxElems)
        return (uint8_t)cMaxElems;

    return RT_ABS(i64Len);
}
17978
17979/**
17980 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
17981 */
17982static const bool g_afCmpOverride[4][4] =
17983{
17984 /* xmm1 AND xmm2/m128 invalid, xmm1 invalid BUT xmm2/m128 valid, xmm1 valid BUT xmm2/m128 invalid, unused dummy/padding for parfait */
17985 { false, false, false, false }, /* Imm8[3:2] = 00b (equal any) */
17986 { false, false, false, false }, /* Imm8[3:2] = 01b (ranges) */
17987 { true, false, false, false }, /* Imm8[3:2] = 10b (equal each) */
17988 { true, true, false, false }, /* Imm8[3:2] = 11b (equal ordered) */
17989};
17990
17991DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
17992{
17993 if (fSrc1Valid && fSrc2Valid)
17994 return fCmpRes;
17995
17996 uint8_t const bSrc1Valid = fSrc1Valid ? 2 : 0;
17997 uint8_t const bSrc2Valid = fSrc2Valid ? 1 : 0;
17998 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
17999}
18000
/**
 * Aggregates the per-pair comparison results into a result bitmask and
 * applies the polarity selection.
 *
 * @returns The result mask after polarity has been applied.
 * @param   afCmpRes    The comparison results, indexed by
 *                      [source 2 element][source 1 element].
 * @param   idxLen1     Length of source 1; elements at or beyond it are invalid.
 * @param   idxLen2     Length of source 2; elements at or beyond it are invalid.
 * @param   cElems      Number of elements per operand (8 or 16).
 * @param   bImm        The immediate; bits 3:2 select the aggregation
 *                      operation and bits 5:4 the polarity.
 */
static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
{
    uint8_t bAggOp = (bImm >> 2) & 0x3;
    uint16_t u16Result = 0;

    switch (bAggOp)
    {
        case 0: /* Equal any */
            /* Bit idxSrc2 is set if the source 2 element matches any source 1 element. */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = 0;
                for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
                {
                    if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
                                                                   idxSrc1 < idxLen1,
                                                                   idxSrc2 < idxLen2,
                                                                   bAggOp))
                    {
                        u16Res = RT_BIT(idxSrc2);
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;

        case 1: /* Ranges */
            /* Bit idxSrc2 is set if both results of any source 1 pair (even and odd element) are true. */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = 0;
                for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
                {
                    if (   iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
                                                                      idxSrc1 < idxLen1,
                                                                      idxSrc2 < idxLen2,
                                                                      bAggOp)
                        && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
                                                                      (idxSrc1 + 1) < idxLen1,
                                                                      idxSrc2 < idxLen2,
                                                                      bAggOp))
                    {
                        u16Res = RT_BIT(idxSrc2);
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;

        case 2: /* Equal each */
            /* Only the diagonal is used: element i of source 1 against element i of source 2. */
            for (uint8_t i = 0; i < cElems; i++)
            {
                if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
                                                               i < idxLen1,
                                                               i < idxLen2,
                                                               bAggOp))
                    u16Result |= RT_BIT(i);
            }
            break;

        case 3: /* Equal ordered */
            u16Result = 0;
            /* Bit idxSrc2 starts out set and is cleared by the first failing
               comparison of source 1 element idxSrc1 against source 2 element
               idxSrc2 + idxSrc1 (k). */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = RT_BIT(idxSrc2);
                for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
                {
                    if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
                                                                    idxSrc1 < idxLen1,
                                                                    k < idxLen2,
                                                                    bAggOp))
                    {
                        u16Res = 0;
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;
    }

    /* Polarity selection. */
    switch ((bImm >> 4) & 0x3)
    {
        case 0:
        case 2:
            /* Nothing to do. */
            break;
        case 1:
            /* Invert all cElems result bits. */
            u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
            break;
        case 3:
            /* Invert only the bits below the source 2 length. */
            u16Result ^= RT_BIT(idxLen2) - 1;
            break;
        default:
            AssertReleaseFailed();
    }

    return u16Result;
}
18104
18105DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
18106{
18107 uint32_t fEFlags = 0;
18108
18109 if (u16Result)
18110 fEFlags |= X86_EFL_CF;
18111 if (cLen2 < cElems)
18112 fEFlags |= X86_EFL_ZF;
18113 if (cLen1 < cElems)
18114 fEFlags |= X86_EFL_SF;
18115 if (u16Result & 0x1)
18116 fEFlags |= X86_EFL_OF;
18117 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
18118}
18119
18120DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
18121 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
18122{
18123 bool afCmpRes[16][16];
18124 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18125
18126 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
18127 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
18128 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
18129
18130 return u16Result;
18131}
18132
18133DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
18134{
18135 if (bImm & RT_BIT(6))
18136 {
18137 /* Index for MSB set. */
18138 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
18139 if (idxMsb)
18140 *pu32Ecx = idxMsb - 1;
18141 else
18142 *pu32Ecx = cElems;
18143 }
18144 else
18145 {
18146 /* Index for LSB set. */
18147 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
18148 if (idxLsb)
18149 *pu32Ecx = idxLsb - 1;
18150 else
18151 *pu32Ecx = cElems;
18152 }
18153}
18154
18155IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
18156{
18157 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18158 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
18159 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
18160
18161 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18162 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
18163}
18164
18165
18166/**
18167 * [V]PCMPESTRI
18168 */
18169IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18170{
18171 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18172 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18173 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18174
18175 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18176 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
18177}
18178
18179
18180/**
18181 * [V]PCMPISTRM
18182 */
18183DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
18184{
18185 if (bImm & RT_BIT(6))
18186 {
18187 /* Generate a mask. */
18188 if (cElems == 8)
18189 {
18190 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18191 if (u16Result & RT_BIT(i))
18192 puDst->au16[i] = 0xffff;
18193 else
18194 puDst->au16[i] = 0;
18195 }
18196 else
18197 {
18198 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
18199 if (u16Result & RT_BIT(i))
18200 puDst->au8[i] = 0xff;
18201 else
18202 puDst->au8[i] = 0;
18203 }
18204 }
18205 else
18206 {
18207 /* Store the result. */
18208 puDst->au64[0] = u16Result;
18209 puDst->au64[1] = 0;
18210 }
18211}
18212
18213IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
18214{
18215 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18216 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
18217 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
18218
18219 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18220 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18221}
18222
18223
18224/**
18225 * [V]PCMPESTRM
18226 */
18227IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18228{
18229 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18230 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18231 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18232
18233 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18234 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18235}
18236
18237
18238/*
18239 * [V]PCLMULQDQ
18240 */
18241IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18242{
18243 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
18244}
18245
18246
18247IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
18248{
18249 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
18250 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
18251
18252 puDst->au64[0] = 0;
18253 puDst->au64[1] = 0;
18254
18255 /*
18256 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
18257 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
18258 * and squeeze out some optimizations.
18259 */
18260 if (uSrc1 & 0x1)
18261 puDst->au64[0] = uSrc2;
18262
18263 uSrc1 >>= 1;
18264
18265 uint8_t iDigit = 1;
18266 while (uSrc1)
18267 {
18268 if (uSrc1 & 0x1)
18269 {
18270 puDst->au64[0] ^= (uSrc2 << iDigit);
18271 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
18272 }
18273
18274 uSrc1 >>= 1;
18275 iDigit++;
18276 }
18277}
18278
18279
18280/**
18281 * [V]MOVMSKPS
18282 */
18283#ifdef IEM_WITHOUT_ASSEMBLY
18284IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18285{
18286 *pu8Dst = puSrc->au32[0] >> 31;
18287 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18288 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18289 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18290}
18291
18292#endif
18293
18294IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18295{
18296 *pu8Dst = puSrc->au32[0] >> 31;
18297 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18298 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18299 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18300}
18301
18302
18303IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18304{
18305 *pu8Dst = puSrc->au32[0] >> 31;
18306 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18307 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18308 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18309 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
18310 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
18311 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
18312 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
18313}
18314
18315
18316/**
18317 * [V]MOVMSKPD
18318 */
18319#ifdef IEM_WITHOUT_ASSEMBLY
18320IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18321{
18322 *pu8Dst = puSrc->au64[0] >> 63;
18323 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18324}
18325
18326#endif
18327
18328IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18329{
18330 *pu8Dst = puSrc->au64[0] >> 63;
18331 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18332}
18333
18334
18335IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18336{
18337 *pu8Dst = puSrc->au64[0] >> 63;
18338 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18339 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
18340 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
18341}
18342
18343
18344/**
18345 * CVTTSD2SI
18346 */
18347#ifdef IEM_WITHOUT_ASSEMBLY
18348IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
18349{
18350 RTFLOAT64U r64Src;
18351
18352 r64Src.u = *pu64Src;
18353 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18354
18355 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18356 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18357 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18358}
18359
18360
18361IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
18362{
18363 RTFLOAT64U r64Src;
18364
18365 r64Src.u = *pu64Src;
18366 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18367
18368 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18369 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18370 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18371}
18372#endif
18373
18374
18375/**
18376 * CVTSD2SI
18377 */
18378#ifdef IEM_WITHOUT_ASSEMBLY
18379IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
18380{
18381 RTFLOAT64U r64Src;
18382
18383 r64Src.u = *pu64Src;
18384 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18385
18386 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18387 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18388 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18389}
18390
18391
18392IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
18393{
18394 RTFLOAT64U r64Src;
18395
18396 r64Src.u = *pu64Src;
18397 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18398
18399 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18400 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18401 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18402}
18403#endif
18404
18405
18406/**
18407 * CVTTSS2SI
18408 */
18409#ifdef IEM_WITHOUT_ASSEMBLY
18410IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
18411{
18412 RTFLOAT32U r32Src;
18413
18414 r32Src.u = *pu32Src;
18415 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18416
18417 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18418 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18419 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18420}
18421
18422
18423IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
18424{
18425 RTFLOAT32U r32Src;
18426
18427 r32Src.u = *pu32Src;
18428 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18429
18430 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18431 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18432 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18433}
18434#endif
18435
18436
18437/**
18438 * CVTSS2SI
18439 */
18440#ifdef IEM_WITHOUT_ASSEMBLY
18441IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
18442{
18443 RTFLOAT32U r32Src;
18444
18445 r32Src.u = *pu32Src;
18446 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18447
18448 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18449 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18450 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18451}
18452
18453
18454IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
18455{
18456 RTFLOAT32U r32Src;
18457
18458 r32Src.u = *pu32Src;
18459 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18460
18461 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18462 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18463 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18464}
18465#endif
18466
18467
18468/**
18469 * CVTSI2SD
18470 */
18471#ifdef IEM_WITHOUT_ASSEMBLY
18472IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
18473{
18474 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18475 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
18476 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
18477}
18478
18479
18480IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
18481{
18482 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18483 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
18484 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
18485}
18486#endif
18487
18488
18489/**
18490 * CVTSI2SS
18491 */
18492#ifdef IEM_WITHOUT_ASSEMBLY
18493IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
18494{
18495 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18496 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
18497 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
18498}
18499
18500
18501IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
18502{
18503 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18504 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
18505 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
18506}
18507#endif
18508
18509
18510/**
18511 * [V]UCOMISS
18512 */
18513#ifdef IEM_WITHOUT_ASSEMBLY
18514IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18515{
18516 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18517
18518 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
18519 {
18520 *pfMxcsr |= X86_MXCSR_IE;
18521 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18522 }
18523 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
18524 {
18525 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18526 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18527 }
18528 else
18529 {
18530 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18531
18532 RTFLOAT32U r32Src1, r32Src2;
18533 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
18534 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
18535
18536 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18537 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18538 if (f32_eq(f32Src1, f32Src2, &SoftState))
18539 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18540 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18541 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18542 /* else: GREATER_THAN 000 */
18543
18544 *pfMxcsr |= fDe;
18545 }
18546
18547 *pfEFlags = fEFlagsNew;
18548}
18549#endif
18550
/**
 * VUCOMISS fallback: forwards all arguments unchanged to iemAImpl_ucomiss_u128.
 *
 * NOTE(review): the C definition of iemAImpl_ucomiss_u128 in this file is only
 * compiled when IEM_WITHOUT_ASSEMBLY is defined; presumably another
 * implementation is linked in otherwise - confirm.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
18555
18556
18557/**
18558 * [V]UCOMISD
18559 */
18560#ifdef IEM_WITHOUT_ASSEMBLY
18561IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18562{
18563 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18564
18565 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
18566 {
18567 *pfMxcsr |= X86_MXCSR_IE;
18568 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18569 }
18570 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
18571 {
18572 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18573 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18574 }
18575 else
18576 {
18577 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18578
18579 RTFLOAT64U r64Src1, r64Src2;
18580 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0])
18581 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
18582
18583 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18584 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18585 if (f64_eq(f64Src1, f64Src2, &SoftState))
18586 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18587 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18588 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18589 /* else: GREATER_THAN 000 */
18590
18591 *pfMxcsr |= fDe;
18592 }
18593
18594 *pfEFlags = fEFlagsNew;
18595}
18596#endif
18597
/**
 * VUCOMISD fallback: forwards all arguments unchanged to iemAImpl_ucomisd_u128.
 *
 * NOTE(review): the C definition of iemAImpl_ucomisd_u128 in this file is only
 * compiled when IEM_WITHOUT_ASSEMBLY is defined; presumably another
 * implementation is linked in otherwise - confirm.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
18602
18603
18604/**
18605 * [V]COMISS
18606 */
18607#ifdef IEM_WITHOUT_ASSEMBLY
18608IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18609{
18610 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18611
18612 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
18613 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
18614 {
18615 *pfMxcsr |= X86_MXCSR_IE;
18616 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18617 }
18618 else
18619 {
18620 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18621
18622 RTFLOAT32U r32Src1, r32Src2;
18623 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0])
18624 | iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
18625
18626 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18627 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18628 if (f32_eq(f32Src1, f32Src2, &SoftState))
18629 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18630 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18631 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18632 /* else: GREATER_THAN 000 */
18633
18634 *pfMxcsr |= fDe;
18635 }
18636
18637 *pfEFlags = fEFlagsNew;
18638}
18639#endif
18640
18641
/**
 * VCOMISS fallback: forwards all arguments unchanged to iemAImpl_comiss_u128.
 *
 * NOTE(review): the C definition of iemAImpl_comiss_u128 in this file is only
 * compiled when IEM_WITHOUT_ASSEMBLY is defined; presumably another
 * implementation is linked in otherwise - confirm.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
18646
18647
18648/**
18649 * [V]COMISD
18650 */
18651#ifdef IEM_WITHOUT_ASSEMBLY
18652IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18653{
18654 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18655
18656 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
18657 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
18658 {
18659 *pfMxcsr |= X86_MXCSR_IE;
18660 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18661 }
18662 else
18663 {
18664 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18665
18666 RTFLOAT64U r64Src1, r64Src2;
18667 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
18668 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
18669
18670 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18671 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18672 if (f64_eq(f64Src1, f64Src2, &SoftState))
18673 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18674 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18675 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18676 /* else: GREATER_THAN 000 */
18677
18678 *pfMxcsr |= fDe;
18679 }
18680
18681 *pfEFlags = fEFlagsNew;
18682}
18683#endif
18684
/**
 * VCOMISD fallback: forwards all arguments unchanged to iemAImpl_comisd_u128.
 *
 * NOTE(review): the C definition of iemAImpl_comisd_u128 in this file is only
 * compiled when IEM_WITHOUT_ASSEMBLY is defined; presumably another
 * implementation is linked in otherwise - confirm.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
18689
18690
18691/**
18692 * CMPPS / CMPPD / CMPSS / CMPSD
18693 */
18694#ifdef IEM_WITHOUT_ASSEMBLY
18695/**
18696 * A compare truth table entry.
18697 */
18698typedef struct CMPTRUTHTBLENTRY
18699{
18700 /** Flag whether the \#IA is signalled when one of the source oeprans is a QNaN */
18701 bool fSignalsOnQNan;
18702 /** The boolean result when the input operands are unordered. */
18703 bool fUnordered;
18704 /** The boolean result when A = B. */
18705 bool fEqual;
18706 /** The boolean result when A < B. */
18707 bool fLowerThan;
18708 /** The boolean result when A > B. */
18709 bool fGreaterThan;
18710} CMPTRUTHTBLENTRY;
18711/** Pointer to a const truth table entry. */
18712typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
18713
18714
/** The compare truth table (indexed by immediate).
 * Only the eight predicates 00H thru 07H are present; the compare workers
 * assert that the immediate is within the table. */
static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
{
    /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
    /* 00H (EQ_OQ) */ { false, false, true, false, false },
    /* 01H (LT_OS) */ { true, false, false, true, false },
    /* 02H (LE_OS) */ { true, false, true, true, false },
    /* 03H (UNORD_Q) */ { false, true, false, false, false },
    /* 04H (NEQ_UQ) */ { false, true, false, true, true },
    /* 05H (NLT_US) */ { true, true, true, false, true },
    /* 06H (NLE_US) */ { true, true, false, false, true },
    /* 07H (ORD_Q) */ { false, false, true, true, true },
    /** @todo AVX variants. */
};
18729
18730
18731static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
18732{
18733 bool fRes;
18734 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18735
18736 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
18737 {
18738 *pfMxcsr |= X86_MXCSR_IE;
18739 fRes = g_aCmpTbl[bEvil].fUnordered;
18740 }
18741 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
18742 {
18743 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18744 *pfMxcsr |= X86_MXCSR_IE;
18745 fRes = g_aCmpTbl[bEvil].fUnordered;
18746 }
18747 else
18748 {
18749 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18750
18751 RTFLOAT32U r32Src1, r32Src2;
18752 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
18753 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
18754
18755 *pfMxcsr |= fDe;
18756 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18757 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18758 if (f32_eq(f32Src1, f32Src2, &SoftState))
18759 fRes = g_aCmpTbl[bEvil].fEqual;
18760 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18761 fRes = g_aCmpTbl[bEvil].fLowerThan;
18762 else
18763 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18764 }
18765
18766 return fRes;
18767}
18768
18769
18770static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
18771{
18772 bool fRes;
18773 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18774
18775 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
18776 {
18777 *pfMxcsr |= X86_MXCSR_IE;
18778 fRes = g_aCmpTbl[bEvil].fUnordered;
18779 }
18780 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
18781 {
18782 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18783 *pfMxcsr |= X86_MXCSR_IE;
18784 fRes = g_aCmpTbl[bEvil].fUnordered;
18785 }
18786 else
18787 {
18788 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18789
18790 RTFLOAT64U r64Src1, r64Src2;
18791 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
18792 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
18793
18794 *pfMxcsr |= fDe;
18795 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18796 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18797 if (f64_eq(f64Src1, f64Src2, &SoftState))
18798 fRes = g_aCmpTbl[bEvil].fEqual;
18799 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18800 fRes = g_aCmpTbl[bEvil].fLowerThan;
18801 else
18802 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18803 }
18804
18805 return fRes;
18806}
18807
18808
18809IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18810{
18811 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18812 {
18813 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
18814 puDst->au32[i] = UINT32_MAX;
18815 else
18816 puDst->au32[i] = 0;
18817 }
18818}
18819
18820
18821IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18822{
18823 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18824 {
18825 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
18826 puDst->au64[i] = UINT64_MAX;
18827 else
18828 puDst->au64[i] = 0;
18829 }
18830}
18831
18832
18833IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18834{
18835 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
18836 puDst->au32[0] = UINT32_MAX;
18837 else
18838 puDst->au32[0] = 0;
18839
18840 puDst->au32[1] = pSrc->uSrc1.au32[1];
18841 puDst->au64[1] = pSrc->uSrc1.au64[1];
18842}
18843
18844
18845IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18846{
18847 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
18848 puDst->au64[0] = UINT64_MAX;
18849 else
18850 puDst->au64[0] = 0;
18851
18852 puDst->au64[1] = pSrc->uSrc1.au64[1];
18853}
18854#endif
18855
18856
18857/**
18858 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
18859 */
18860
/** ROUNDxx immediate: the rounding control value, shifted into the MXCSR RC
 *  field by iemSseRoundXXMxcsrAndImmToSoftState when ROUND_SEL is clear. */
#define X86_SSE_ROUNDXX_IMM_RC_MASK    UINT8_C(0x03)
/** ROUNDxx immediate: when set, the rounding control in MXCSR is used as-is
 *  and the RC bits of the immediate are ignored. */
#define X86_SSE_ROUNDXX_IMM_ROUND_SEL  UINT8_C(0x04)
/** ROUNDxx immediate: when set, the round workers pass 'exact = false' to the
 *  softfloat roundToInt functions. */
#define X86_SSE_ROUNDXX_IMM_PRECISION  UINT8_C(0x08)

/** ROUNDxx immediate: all the bits the ROUNDxx implementations look at. */
#define X86_SSE_ROUNDXX_IMM_MASK       UINT8_C(0x0F)
18866
18867DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
18868{
18869 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
18870 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18871
18872 fMxcsr &= ~X86_MXCSR_RC_MASK;
18873 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
18874 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18875}
18876
18877static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
18878{
18879 RTFLOAT32U r32Src, r32Dst;
18880 float32_t f32Src;
18881 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18882 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18883
18884 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
18885 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
18886
18887 iemFpSoftF32ToIprt(&r32Dst, f32Src);
18888 return r32Dst;
18889}
18890
18891static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
18892{
18893 RTFLOAT64U r64Src, r64Dst;
18894 float64_t f64Src;
18895 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18896 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18897
18898 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
18899 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
18900
18901 iemFpSoftF64ToIprt(&r64Dst, f64Src);
18902 return r64Dst;
18903}
18904
18905#ifdef IEM_WITHOUT_ASSEMBLY
18906IEM_DECL_IMPL_DEF(void, iemAImpl_roundss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18907{
18908 puDst->ar32[0] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18909 puDst->au32[1] = pSrc->uSrc1.au32[1];
18910 puDst->au64[1] = pSrc->uSrc1.au64[1];
18911}
18912
18913
18914IEM_DECL_IMPL_DEF(void, iemAImpl_roundsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18915{
18916 puDst->ar64[0] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18917 puDst->au64[1] = pSrc->uSrc1.au64[1];
18918}
18919#endif
18920
18921IEM_DECL_IMPL_DEF(void, iemAImpl_roundps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18922{
18923 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18924 {
18925 puDst->ar32[i] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18926 }
18927}
18928
18929
18930IEM_DECL_IMPL_DEF(void, iemAImpl_roundpd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18931{
18932 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18933 {
18934 puDst->ar64[i] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18935 }
18936}
18937
18938/**
18939 * CVTPD2PI
18940 */
18941#ifdef IEM_WITHOUT_ASSEMBLY
18942static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18943{
18944 RTFLOAT64U r64Src;
18945 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18946
18947 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18948 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18949 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18950}
18951
18952
18953IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18954{
18955 RTUINT64U u64Res;
18956 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
18957 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
18958
18959 *pu64Dst = u64Res.u;
18960 *pfMxcsr = fMxcsrOut;
18961}
18962#endif
18963
18964
18965/**
18966 * CVTTPD2PI
18967 */
18968#ifdef IEM_WITHOUT_ASSEMBLY
18969static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18970{
18971 RTFLOAT64U r64Src;
18972 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18973
18974 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18975 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18976 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18977}
18978
18979
18980IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18981{
18982 RTUINT64U u64Res;
18983 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
18984 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
18985
18986 *pu64Dst = u64Res.u;
18987 *pfMxcsr = fMxcsrOut;
18988}
18989#endif
18990
18991
18992/**
18993 * CVTPI2PS
18994 */
18995#ifdef IEM_WITHOUT_ASSEMBLY
18996static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
18997{
18998 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18999 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
19000 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
19001}
19002
19003
19004IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
19005{
19006 RTUINT64U uSrc = { u64Src };
19007 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
19008 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
19009 *pfMxcsr = fMxcsrOut;
19010}
19011#endif
19012
19013
19014/**
19015 * CVTPI2PD
19016 */
19017#ifdef IEM_WITHOUT_ASSEMBLY
19018static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
19019{
19020 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19021 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
19022 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
19023}
19024
19025
19026IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
19027{
19028 RTUINT64U uSrc = { u64Src };
19029 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
19030 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
19031 *pfMxcsr = fMxcsrOut;
19032}
19033#endif
19034
19035
19036/**
19037 * CVTPS2PI
19038 */
19039#ifdef IEM_WITHOUT_ASSEMBLY
19040static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
19041{
19042 RTFLOAT32U r32Src;
19043 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
19044
19045 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19046 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
19047 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
19048}
19049
19050
19051IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
19052{
19053 RTUINT64U uDst;
19054 RTUINT64U uSrc = { u64Src };
19055 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
19056 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
19057 *pu64Dst = uDst.u;
19058 *pfMxcsr = fMxcsrOut;
19059}
19060#endif
19061
19062
19063/**
19064 * CVTTPS2PI
19065 */
19066#ifdef IEM_WITHOUT_ASSEMBLY
19067static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
19068{
19069 RTFLOAT32U r32Src;
19070 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
19071
19072 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19073 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
19074 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
19075}
19076
19077
19078IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
19079{
19080 RTUINT64U uDst;
19081 RTUINT64U uSrc = { u64Src };
19082 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
19083 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
19084 *pu64Dst = uDst.u;
19085 *pfMxcsr = fMxcsrOut;
19086}
19087#endif
19088
19089/**
19090 * RDRAND
19091 */
19092IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
19093{
19094 *puDst = 0;
19095 *pEFlags &= ~X86_EFL_STATUS_BITS;
19096 *pEFlags |= X86_EFL_CF;
19097}
19098
19099IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
19100{
19101 *puDst = 0;
19102 *pEFlags &= ~X86_EFL_STATUS_BITS;
19103 *pEFlags |= X86_EFL_CF;
19104}
19105
19106IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
19107{
19108 *puDst = 0;
19109 *pEFlags &= ~X86_EFL_STATUS_BITS;
19110 *pEFlags |= X86_EFL_CF;
19111}
19112
19113/**
19114 * RDSEED
19115 */
19116IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
19117{
19118 *puDst = 0;
19119 *pEFlags &= ~X86_EFL_STATUS_BITS;
19120 *pEFlags |= X86_EFL_CF;
19121}
19122
19123IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
19124{
19125 *puDst = 0;
19126 *pEFlags &= ~X86_EFL_STATUS_BITS;
19127 *pEFlags |= X86_EFL_CF;
19128}
19129
19130IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
19131{
19132 *puDst = 0;
19133 *pEFlags &= ~X86_EFL_STATUS_BITS;
19134 *pEFlags |= X86_EFL_CF;
19135}
19136
19137
19138/**
19139 * SHA1NEXTE
19140 */
19141IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19142{
19143 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
19144
19145 puDst->au32[0] = puSrc->au32[0];
19146 puDst->au32[1] = puSrc->au32[1];
19147 puDst->au32[2] = puSrc->au32[2];
19148 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
19149}
19150
19151/**
19152 * SHA1MSG1
19153 */
19154IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19155{
19156 uint32_t u32W0 = puDst->au32[3];
19157 uint32_t u32W1 = puDst->au32[2];
19158 uint32_t u32W2 = puDst->au32[1];
19159 uint32_t u32W3 = puDst->au32[0];
19160 uint32_t u32W4 = puSrc->au32[3];
19161 uint32_t u32W5 = puSrc->au32[2];
19162
19163 puDst->au32[3] = u32W2 ^ u32W0;
19164 puDst->au32[2] = u32W3 ^ u32W1;
19165 puDst->au32[1] = u32W4 ^ u32W2;
19166 puDst->au32[0] = u32W5 ^ u32W3;
19167}
19168
19169/**
19170 * SHA1MSG2
19171 */
19172IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19173{
19174 uint32_t u32W13 = puSrc->au32[2];
19175 uint32_t u32W14 = puSrc->au32[1];
19176 uint32_t u32W15 = puSrc->au32[0];
19177 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
19178 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
19179 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
19180 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
19181
19182 puDst->au32[3] = u32W16;
19183 puDst->au32[2] = u32W17;
19184 puDst->au32[1] = u32W18;
19185 puDst->au32[0] = u32W19;
19186}
19187
/**
 * SHA1RNDS4
 *
 * Type of the per-round logical function taking the B, C and D state words.
 * iemAImpl_sha1rnds4_u128_fallback picks one of the four functions below by
 * the low two bits of its immediate.
 */
typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
/** Pointer to a SHA1RNDS4 round function. */
typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;

/** Round function 0: takes the bits of C where B is set and the bits of D
 *  where B is clear. */
static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
{
    return (u32B & u32C) ^ (~u32B & u32D);
}

/** Round function 1: XOR of the three inputs. */
static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
{
    return u32B ^ u32C ^ u32D;
}

/** Round function 2: bitwise majority of the three inputs. */
static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
{
    return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
}

/** Round function 3: XOR of the three inputs (same expression as function 1). */
static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
{
    return u32B ^ u32C ^ u32D;
}
19213
19214IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19215{
19216 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
19217 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
19218
19219 uint32_t au32A[5];
19220 uint32_t au32B[5];
19221 uint32_t au32C[5];
19222 uint32_t au32D[5];
19223 uint32_t au32E[5];
19224 uint32_t au32W[4];
19225 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
19226 uint32_t u32K = s_au32K[bEvil & 0x3];
19227
19228 au32A[0] = puDst->au32[3];
19229 au32B[0] = puDst->au32[2];
19230 au32C[0] = puDst->au32[1];
19231 au32D[0] = puDst->au32[0];
19232 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
19233 au32W[i] = puSrc->au32[3 - i];
19234
19235 /* Round 0 is a bit different than the other rounds. */
19236 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
19237 au32B[1] = au32A[0];
19238 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
19239 au32D[1] = au32C[0];
19240 au32E[1] = au32D[0];
19241
19242 for (uint32_t i = 1; i <= 3; i++)
19243 {
19244 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
19245 au32B[i + 1] = au32A[i];
19246 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
19247 au32D[i + 1] = au32C[i];
19248 au32E[i + 1] = au32D[i];
19249 }
19250
19251 puDst->au32[3] = au32A[4];
19252 puDst->au32[2] = au32B[4];
19253 puDst->au32[1] = au32C[4];
19254 puDst->au32[0] = au32D[4];
19255}
19256
19257
19258/**
19259 * SHA256MSG1
19260 */
19261DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
19262{
19263 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
19264}
19265
19266IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19267{
19268 uint32_t u32W4 = puSrc->au32[0];
19269 uint32_t u32W3 = puDst->au32[3];
19270 uint32_t u32W2 = puDst->au32[2];
19271 uint32_t u32W1 = puDst->au32[1];
19272 uint32_t u32W0 = puDst->au32[0];
19273
19274 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
19275 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
19276 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
19277 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
19278}
19279
19280/**
19281 * SHA256MSG2
19282 */
19283DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
19284{
19285 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
19286}
19287
19288IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19289{
19290 uint32_t u32W14 = puSrc->au32[2];
19291 uint32_t u32W15 = puSrc->au32[3];
19292 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
19293 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
19294 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
19295 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
19296
19297 puDst->au32[3] = u32W19;
19298 puDst->au32[2] = u32W18;
19299 puDst->au32[1] = u32W17;
19300 puDst->au32[0] = u32W16;
19301}
19302
19303/**
19304 * SHA256RNDS2
19305 */
19306DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19307{
19308 return (u32X & u32Y) ^ (~u32X & u32Z);
19309}
19310
19311DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19312{
19313 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
19314}
19315
19316DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
19317{
19318 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
19319}
19320
19321DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
19322{
19323 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
19324}
19325
19326IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
19327{
19328 uint32_t au32A[3];
19329 uint32_t au32B[3];
19330 uint32_t au32C[3];
19331 uint32_t au32D[3];
19332 uint32_t au32E[3];
19333 uint32_t au32F[3];
19334 uint32_t au32G[3];
19335 uint32_t au32H[3];
19336 uint32_t au32WK[2];
19337
19338 au32A[0] = puSrc->au32[3];
19339 au32B[0] = puSrc->au32[2];
19340 au32C[0] = puDst->au32[3];
19341 au32D[0] = puDst->au32[2];
19342 au32E[0] = puSrc->au32[1];
19343 au32F[0] = puSrc->au32[0];
19344 au32G[0] = puDst->au32[1];
19345 au32H[0] = puDst->au32[0];
19346
19347 au32WK[0] = puXmm0Constants->au32[0];
19348 au32WK[1] = puXmm0Constants->au32[1];
19349
19350 for (uint32_t i = 0; i < 2; i++)
19351 {
19352 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19353 + iemAImpl_sha256_upper_sigma1(au32E[i])
19354 + au32WK[i]
19355 + au32H[i]
19356 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
19357 + iemAImpl_sha256_upper_sigma0(au32A[i]);
19358 au32B[i + 1] = au32A[i];
19359 au32C[i + 1] = au32B[i];
19360 au32D[i + 1] = au32C[i];
19361 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19362 + iemAImpl_sha256_upper_sigma1(au32E[i])
19363 + au32WK[i]
19364 + au32H[i]
19365 + au32D[i];
19366 au32F[i + 1] = au32E[i];
19367 au32G[i + 1] = au32F[i];
19368 au32H[i + 1] = au32G[i];
19369 }
19370
19371 puDst->au32[3] = au32A[2];
19372 puDst->au32[2] = au32B[2];
19373 puDst->au32[1] = au32E[2];
19374 puDst->au32[0] = au32F[2];
19375}
19376
19377
/**
 * ADCX
 *
 * Common body of the ADCX and ADOX implementations: adds uSrc and the
 * incoming value of flag @a a_Flag to *puDst and updates only @a a_Flag in
 * *pfEFlags to reflect the unsigned carry out.
 *
 * The macro expects puDst, uSrc and pfEFlags to be in scope at the point of
 * expansion.  The flag is first set from the carry of *puDst + uSrc, and is
 * additionally set when that sum equals @a a_Max and the incoming flag was
 * set, since adding the carry-in then wraps around.
 *
 * @param   a_Flag      The EFLAGS bit used as carry in and carry out.
 * @param   a_Type      The operand type.
 * @param   a_Max       The maximum value of @a a_Type.
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool f = RT_BOOL(*pfEFlags & (a_Flag)); \
        a_Type uTmp = *puDst + uSrc; \
        if (uTmp < uSrc) \
            *pfEFlags |= (a_Flag); \
        else \
            *pfEFlags &= ~(a_Flag); \
        if (   uTmp == a_Max \
            && f) \
            *pfEFlags |= (a_Flag); \
        if (f) \
            uTmp++; \
        *puDst = uTmp; \
    } \
    while (0)
19398
/**
 * ADCX 32-bit fallback: *puDst += uSrc + CF, updating only X86_EFL_CF.
 *
 * @param   puDst       The destination and first addend.
 * @param   uSrc        The second addend.
 * @param   pfEFlags    EFLAGS; CF is read as carry in and written as carry out.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
}

/**
 * ADCX 64-bit fallback: *puDst += uSrc + CF, updating only X86_EFL_CF.
 *
 * @param   puDst       The destination and first addend.
 * @param   uSrc        The second addend.
 * @param   pfEFlags    EFLAGS; CF is read as carry in and written as carry out.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
}
19408
19409# if defined(IEM_WITHOUT_ASSEMBLY)
19410
/**
 * ADCX 32-bit, C variant used when building without assembly; same body as
 * the fallback.
 *
 * @param   puDst       The destination and first addend.
 * @param   uSrc        The second addend.
 * @param   pfEFlags    EFLAGS; CF is read as carry in and written as carry out.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
}

/**
 * ADCX 64-bit, C variant used when building without assembly; same body as
 * the fallback.
 *
 * @param   puDst       The destination and first addend.
 * @param   uSrc        The second addend.
 * @param   pfEFlags    EFLAGS; CF is read as carry in and written as carry out.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
}
19420
19421#endif
19422
19423
/**
 * ADOX
 *
 * 32-bit fallback: *puDst += uSrc + OF, updating only X86_EFL_OF, which
 * serves as the carry flag for this instruction.
 *
 * @param   puDst       The destination and first addend.
 * @param   uSrc        The second addend.
 * @param   pfEFlags    EFLAGS; OF is read as carry in and written as carry out.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
}

/**
 * ADOX 64-bit fallback: *puDst += uSrc + OF, updating only X86_EFL_OF.
 *
 * @param   puDst       The destination and first addend.
 * @param   uSrc        The second addend.
 * @param   pfEFlags    EFLAGS; OF is read as carry in and written as carry out.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
}
19436
19437# if defined(IEM_WITHOUT_ASSEMBLY)
19438
/**
 * ADOX 32-bit, C variant used when building without assembly; same body as
 * the fallback.
 *
 * @param   puDst       The destination and first addend.
 * @param   uSrc        The second addend.
 * @param   pfEFlags    EFLAGS; OF is read as carry in and written as carry out.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
}

/**
 * ADOX 64-bit, C variant used when building without assembly; same body as
 * the fallback.
 *
 * @param   puDst       The destination and first addend.
 * @param   uSrc        The second addend.
 * @param   pfEFlags    EFLAGS; OF is read as carry in and written as carry out.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
}
19448
19449# endif
19450
19451
19452/**
19453 * MPSADBW
19454 */
19455IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19456{
19457 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19458 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19459 int16_t ai16Src1[11];
19460 int16_t ai16Src2[4];
19461
19462 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19463 ai16Src1[i] = puDst->au8[idxSrc1 + i];
19464
19465 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19466 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
19467
19468 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19469 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19470 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19471 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19472 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19473}
19474
19475
19476IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
19477{
19478 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19479 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19480 int16_t ai16Src1[11];
19481 int16_t ai16Src2[4];
19482
19483 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19484 ai16Src1[i] = puSrc1->au8[idxSrc1 + i];
19485
19486 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19487 ai16Src2[i] = puSrc2->au8[idxSrc2 + i];
19488
19489 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19490 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19491 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19492 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19493 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19494}
19495
19496
/**
 * VMPSADBW 256-bit fallback.
 *
 * Runs the 128-bit fallback on each 128-bit half.  The low half uses the
 * immediate as is; the high half uses the immediate shifted right by three,
 * so its offsets come from bits 5:3.
 *
 * @param   puDst       Where to store the sixteen 16-bit sums.
 * @param   puSrc1      The first source.
 * @param   puSrc2      The second source.
 * @param   bEvil       The immediate selecting the block offsets.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
{
    RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
    RTUINT256U const uSrc2 = *puSrc2;
    ASMCompilerBarrier();
    iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
    iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil >> 3);
}
19505
19506
19507/**
19508 * VPERM2I128
19509 */
19510IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2i128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19511{
19512 if (bImm & RT_BIT(3))
19513 {
19514 puDst->au64[0] = 0;
19515 puDst->au64[1] = 0;
19516 }
19517 else
19518 {
19519 switch (bImm & 0x3)
19520 {
19521 case 0:
19522 puDst->au64[0] = puSrc1->au64[0];
19523 puDst->au64[1] = puSrc1->au64[1];
19524 break;
19525 case 1:
19526 puDst->au64[0] = puSrc1->au64[2];
19527 puDst->au64[1] = puSrc1->au64[3];
19528 break;
19529 case 2:
19530 puDst->au64[0] = puSrc2->au64[0];
19531 puDst->au64[1] = puSrc2->au64[1];
19532 break;
19533 case 3:
19534 puDst->au64[0] = puSrc2->au64[2];
19535 puDst->au64[1] = puSrc2->au64[3];
19536 break;
19537 }
19538 }
19539
19540 if (bImm & RT_BIT(7))
19541 {
19542 puDst->au64[2] = 0;
19543 puDst->au64[3] = 0;
19544 }
19545 else
19546 {
19547 switch ((bImm >> 4) & 0x3)
19548 {
19549 case 0:
19550 puDst->au64[2] = puSrc1->au64[0];
19551 puDst->au64[3] = puSrc1->au64[1];
19552 break;
19553 case 1:
19554 puDst->au64[2] = puSrc1->au64[2];
19555 puDst->au64[3] = puSrc1->au64[3];
19556 break;
19557 case 2:
19558 puDst->au64[2] = puSrc2->au64[0];
19559 puDst->au64[3] = puSrc2->au64[1];
19560 break;
19561 case 3:
19562 puDst->au64[2] = puSrc2->au64[2];
19563 puDst->au64[3] = puSrc2->au64[3];
19564 break;
19565 }
19566 }
19567}
19568
19569
/**
 * VPERM2F128
 *
 * Forwards all arguments unchanged to iemAImpl_vperm2i128_u256_fallback.
 *
 * @param   puDst       Where to store the result.
 * @param   puSrc1      The first source.
 * @param   puSrc2      The second source.
 * @param   bImm        The immediate, passed through unchanged.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2f128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
{
    iemAImpl_vperm2i128_u256_fallback(puDst, puSrc1, puSrc2, bImm);
}
19577
19578
/**
 * DPPS
 *
 * Not implemented: all arguments are ignored and calling this triggers
 * AssertReleaseFailed().
 *
 * @param   pfMxcsr     Unused.
 * @param   puDst       Unused.
 * @param   pSrc        Unused.
 * @param   bImm        Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_dpps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
{
    RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
    AssertReleaseFailed();
}
19587
19588
/**
 * DPPD
 *
 * Not implemented: all arguments are ignored and calling this triggers
 * AssertReleaseFailed().
 *
 * @param   pfMxcsr     Unused.
 * @param   puDst       Unused.
 * @param   pSrc        Unused.
 * @param   bImm        Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_dppd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
{
    RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
    AssertReleaseFailed();
}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette