VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 103205

Last change on this file since 103205 was 103184, checked in by vboxsync, 14 months ago

Fix broken build and broken 'with assembly' ability to run VMs at all; from r161447:

VMM/IEM: IEMIMPL_SHIFT_OPT_F3 macro to emit vpsll-family instruction blocks
VMM/IEM: add missing 'non-fallback' iemAImpl_vpsll[wdq]_imm_u{128,256} functions

bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 195.6 KB
Line 
1; $Id: IEMAllAImpl.asm 103184 2024-02-03 20:16:32Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; RET XX / RET wrapper for fastcall.
;
; Only 32-bit Windows fastcall is callee-cleans, so only there does the
; argument byte count (%1) get popped on return; everywhere else a plain
; RET is emitted.
;
; @param 1      The number of argument bytes to pop (used on x86 Windows only).
;
%macro RET_FASTCALL 1
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
        ret     %1                      ; fastcall: callee removes the arguments.
 %else
        ret
 %endif
%else
        ret                             ; AMD64: register args, nothing to pop.
%endif
%endmacro

;;
; NAME for fastcall functions.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  ; Win32 fastcall decorates the symbol as <prefix><name>@<cbArgs>.
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif

;;
; BEGINPROC for fastcall functions.
;
; @param 1      The function name (C).
; @param 2      The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
        IBT_ENDBRxx                     ; indirect branch tracking landing pad.
%endmacro
81
82
;
; We employ some macro assembly here to hide the calling convention differences.
;
; A0..A3 name the first four function arguments and T0..T2 the scratch
; registers, mapped to the host registers of the active calling convention.
;
%ifdef RT_ARCH_AMD64
 ; AMD64: all arguments arrive in registers, so the prologues/epilogues
 ; have nothing to save or clean up.
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret                             ; %1 (x86 stack argument bytes) ignored on AMD64.
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64 ABI: integer args in rdi, rsi, rdx, rcx.
  %define A0      rdi
  %define A0_32   edi
  %define A0_16   di
  %define A0_8    dil

  %define A1      rsi
  %define A1_32   esi
  %define A1_16   si
  %define A1_8    sil

  %define A2      rdx
  %define A2_32   edx
  %define A2_16   dx
  %define A2_8    dl

  %define A3      rcx
  %define A3_32   ecx
  %define A3_16   cx
  %define A3_8    cl
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64 ABI: integer args in rcx, rdx, r8, r9.
  %define A0      rcx
  %define A0_32   ecx
  %define A0_16   cx
  %define A0_8    cl

  %define A1      rdx
  %define A1_32   edx
  %define A1_16   dx
  %define A1_8    dl

  %define A2      r8
  %define A2_32   r8d
  %define A2_16   r8w
  %define A2_8    r8b

  %define A3      r9
  %define A3_32   r9d
  %define A3_16   r9w
  %define A3_8    r9b
 %endif

 ; Scratch registers (volatile in both 64-bit conventions).
 %define T0      rax
 %define T0_32   eax
 %define T0_16   ax
 %define T0_8    al

 %define T1      r11
 %define T1_32   r11d
 %define T1_16   r11w
 %define T1_8    r11b

 %define T2      r10                    ; only AMD64
 %define T2_32   r10d
 %define T2_16   r10w
 %define T2_8    r10b

%else
 ; x86
 ; Fastcall-style convention: A0/A1 in ecx/edx, A2/A3 on the stack.
 ; The prologues save the callee-saved registers used for T1 (edi) and
 ; the stack-based arguments (ebx, esi).
 %macro PROLOGUE_1_ARGS 0
        push    edi                     ; preserve T1.
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi                     ; preserve T1.
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx                     ; preserve ebx, then load A2 from the stack.
        mov     ebx, [esp + 4 + 4]      ; esp + saved ebx + return address.
        push    edi                     ; preserve T1.
 %endmacro
213 %macro EPILOGUE_3_ARGS_EX 1
214 %if (%1) < 4
215 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
216 %endif
217 pop edi
218 pop ebx
219 ret %1
220 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4            ; default: pop the one stack argument (A2).
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx                     ; preserve callee-saved regs used for A2/A3/T1,
        push    edi                     ; then load the stack-based arguments.
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; esp + 3 saved regs + return address -> A2.
        mov     esi, [esp + 12 + 4 + 4] ; next slot -> A3.
 %endmacro
232 %macro EPILOGUE_4_ARGS_EX 1
233 %if (%1) < 8
234 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
235 %endif
236 pop esi
237 pop edi
238 pop ebx
239 ret %1
240 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8            ; default: pop the two stack arguments (A2, A3).
 %endmacro

 ; Argument registers: A0/A1 in ecx/edx (fastcall style), A2/A3 loaded into
 ; ebx/esi by the prologues above.
 %define A0      ecx
 %define A0_32   ecx
 %define A0_16   cx
 %define A0_8    cl

 %define A1      edx
 %define A1_32   edx
 %define A1_16   dx
 %define A1_8    dl

 %define A2      ebx
 %define A2_32   ebx
 %define A2_16   bx
 %define A2_8    bl

 %define A3      esi
 %define A3_32   esi
 %define A3_16   si
 ; no A3_8: esi has no 8-bit alias on x86.

 %define T0      eax
 %define T0_32   eax
 %define T0_16   ax
 %define T0_8    al

 %define T1      edi
 %define T1_32   edi
 %define T1_16   di
 ; no T1_8 (edi) and no T2 on x86.
%endif
273
274
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Merges the guest's modified+undefined flag bits into the host EFLAGS via
; the stack so the emulated instruction starts from the guest flag state.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param A2    The register pointing to the flags.
; @param 1     The parameter (A0..A3) pointing to the eflags.
; @param 2     The set of modified flags.
; @param 3     The set of undefined flags.
; @param 4     Force loading the flags; defaults to 1, so three-argument
;              invocations always load.
;
%macro IEM_MAYBE_LOAD_FLAGS 3-4 1
 %if (%3 + %4) != 0
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 %endif
%endmacro
295
;;
; Load the relevant flags from [%1].
;
; Unconditional variant of IEM_MAYBE_LOAD_FLAGS: always merges the guest's
; flag bits into the host EFLAGS.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param A2    The register pointing to the flags.
; @param 1     The parameter (A0..A3) pointing to the eflags.
; @param 2     The set of flags to load.
; @param 3     The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
%endmacro
313
;;
; Update the flag.
;
; Copies the modified+undefined bits of the host EFLAGS (as produced by the
; just-executed instruction) into the guest eflags at [%1], preserving the
; guest's other bits.
;
; @remarks Clobbers T0, T1, stack.
; @param 1     The register pointing to the EFLAGS.
; @param 2     The mask of modified flags to save.
; @param 3     The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                      ; T1 = host flags after the operation.
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
333
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; @remarks Clobbers T0, T1, stack.
; @param 1     The register pointing to the EFLAGS.
; @param 2     The mask of modified flags to save.
; @param 3     Mask of additional flags to always clear
; @param 4     Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host flags after the operation.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
357
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0.  So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
; @param 1     The register pointing to the EFLAGS.
; @param 2     The mask of modified flags to save.
; @param 3     Mask of additional flags to always clear
; @param 4     The result register to set SF by.
; @param 5     The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6     The (full) register containing the parity table index. Will be modified!

%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                      ; T2 = host flags (T0/xAX holds the result).
 %else
        push    T0                      ; no T2 on x86: spill T0 and reuse it for the flags.
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; restore the spilled result register.
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; index the 256-entry parity table by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
411
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Does not look at the host EFLAGS at all, only applies the constant masks
; to the guest eflags at [%1].
;
; @remarks Clobbers T0.
; @param 1     The register pointing to the EFLAGS.
; @param 2     Mask of additional flags to always clear
; @param 3     Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
432
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Like IEM_ADJUST_FLAGS, but additionally computes PF from the low byte
; of %4 via the g_afParity lookup table.
;
; @remarks Clobbers T0, %4, EFLAGS; also T2 on AMD64 (used for the table address).
; @param 1     The register pointing to the EFLAGS.
; @param 2     Mask of additional flags to always clear
; @param 3     Mask of additional flags to always set.
; @param 4     The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; index the parity table by the low byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
457
458
;;
; Checks that the size expression %1 matches %2 adjusted according to
; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
;
; Emits DW pseudo-instructions whose operands overflow (triggering an
; assembler warning) whenever the actual size deviates in either direction.
;
; @param 1     The jump array size assembly expression.
; @param 2     The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
;
%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        dw      (0xffff - %2 - 256*4) + %1      ; will cause warning if entries are too big.
        dw      (0xffff + %2 + 256*4) - %1      ; will cause warning if entries are too small.
 %else
        dw      (0xffff - %2) + %1              ; will cause warning if entries are too big.
        dw      (0xffff + %2) - %1              ; will cause warning if entries are too small.
 %endif
%endmacro
474
475
476;*********************************************************************************************************************************
477;* External Symbols *
478;*********************************************************************************************************************************
479extern NAME(g_afParity)
480
481
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1     The instruction mnemonic.
; @param 2     Non-zero if there should be a locked version.
; @param 3     The modified flags.
; @param 4     The undefined flags.
; @param 5     Force flag loading (ADC, SBC).
;
%macro IEMIMPL_BIN_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        %1      byte [A0], A1_8         ; perform the operation directly on the destination.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        lock %1 byte [A0], A1_8         ; atomic variant for LOCK-prefixed guest code.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; instr,lock, modified-flags, undefined flags, force loading flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 1
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 1
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
582
583
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1     The instruction mnemonic.
; @param 2     The modified flags.
; @param 3     The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32     ; 3-operand VEX form: result in T0.
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
624
;;
; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1     The instruction mnemonic.
; @param 2     The modified flags.
; @param 3     The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        ; NOTE(review): uses the 4-argument prologue/epilogue although only
        ; A0..A2 are consumed - presumably just extra register saves; confirm.
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]             ; destination is read-modify-write here.
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
666
667
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1     The instruction mnemonic.
; @param 2     Fallback instruction if applicable.
; @param 3     Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        ; GCC: A2 is rdx; cl (part of A3/rcx) is free for the shift count.
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        ; MSC: cl is A0_8, so swap A0/A2 to get the count into cl while the
        ; destination pointer moves to A2.
        xchg    A2, A0
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; store the full 64-bit result.
   %else
        xchg    A2, A0                  ; count -> cl (A0_8); dst pointer now in A2.
        %2      A1, cl
        mov     [A2], A1                ; store the full 64-bit result via the swapped pointer.
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro
732
; instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0   ; no asm fallback emitted (%3 = 0).
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0   ; no asm fallback emitted (%3 = 0).
739
740
;
; RORX uses a immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        ; GCC: cl (part of A3/rcx) is free for the rotate count.
        mov     cl, A2_8
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        ; MSC: cl is A0_8 - swap A0/A2 so the count lands in cl and the
        ; destination pointer in A2.
        xchg    A2, A0
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1
 %else
        xchg    A2, A0
        ror     A1, cl
        mov     [A2], A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
774
775
;
; MULX
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect (MULX takes its implicit factor from EDX).
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32


BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX - plain MUL produces EDX:EAX.
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
815
%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect (MULX takes its implicit factor from RDX).
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX - plain MUL produces RDX:RAX.
        mov     rax, A3
        mul     A2
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
855
856
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1     The instruction mnemonic.
; @param 2     Non-zero if there should be a locked version.
; @param 3     The modified flags.
; @param 4     The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16        ; atomic variant for LOCK-prefixed guest code.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; modified efl, undefined eflags
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
934
935;;
936; Macro for implementing a bit search operator.
937;
938; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
939; system where the 64-bit accesses requires hand coding.
940;
941; All the functions takes a pointer to the destination memory operand in A0,
942; the source register operand in A1 and a pointer to eflags in A2.
943;
944; In the ZF case the destination register is 'undefined', however it seems that
945; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
946; AMD and Intel and accoridng to https://www.sandpile.org/x86/flags.htm between
947; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
948; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
949;
950; @param 1 The instruction mnemonic.
951; @param 2 The modified flags.
952; @param 3 The undefined flags.
953; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
954;
955%macro IEMIMPL_BIT_OP2 4
956BEGINCODE
957BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
958 PROLOGUE_3_ARGS
959 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
960 %1 T0_16, A1_16
961%if %4 != 0
962 jz .unchanged_dst
963%endif
964 mov [A0], T0_16
965.unchanged_dst:
966 IEM_SAVE_FLAGS A2, %2, %3
967 EPILOGUE_3_ARGS
968ENDPROC iemAImpl_ %+ %1 %+ _u16
969
970BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
971 PROLOGUE_3_ARGS
972 %1 T1_16, A1_16
973%if %4 != 0
974 jz .unchanged_dst
975%endif
976 mov [A0], T1_16
977 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
978 EPILOGUE_3_ARGS
979.unchanged_dst:
980 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
981 EPILOGUE_3_ARGS
982ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
983
984BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
985 PROLOGUE_3_ARGS
986 %1 T0_16, A1_16
987%if %4 != 0
988 jz .unchanged_dst
989%endif
990 mov [A0], T0_16
991.unchanged_dst:
992 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
993 EPILOGUE_3_ARGS
994ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
995
996
997BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
998 PROLOGUE_3_ARGS
999 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1000 %1 T0_32, A1_32
1001%if %4 != 0
1002 jz .unchanged_dst
1003%endif
1004 mov [A0], T0_32
1005.unchanged_dst:
1006 IEM_SAVE_FLAGS A2, %2, %3
1007 EPILOGUE_3_ARGS
1008ENDPROC iemAImpl_ %+ %1 %+ _u32
1009
1010BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1011 PROLOGUE_3_ARGS
1012 %1 T1_32, A1_32
1013%if %4 != 0
1014 jz .unchanged_dst
1015%endif
1016 mov [A0], T1_32
1017 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1018 EPILOGUE_3_ARGS
1019.unchanged_dst:
1020 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1021 EPILOGUE_3_ARGS
1022ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1023
1024BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1025 PROLOGUE_3_ARGS
1026 %1 T0_32, A1_32
1027%if %4 != 0
1028 jz .unchanged_dst
1029%endif
1030 mov [A0], T0_32
1031.unchanged_dst:
1032 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1033 EPILOGUE_3_ARGS
1034ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1035
1036
1037 %ifdef RT_ARCH_AMD64
1038
1039BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1040 PROLOGUE_3_ARGS
1041 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1042 %1 T0, A1
1043%if %4 != 0
1044 jz .unchanged_dst
1045%endif
1046 mov [A0], T0
1047.unchanged_dst:
1048 IEM_SAVE_FLAGS A2, %2, %3
1049 EPILOGUE_3_ARGS_EX 8
1050ENDPROC iemAImpl_ %+ %1 %+ _u64
1051
1052BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1053 PROLOGUE_3_ARGS
1054 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1055 %1 T1, A1
1056%if %4 != 0
1057 jz .unchanged_dst
1058%endif
1059 mov [A0], T1
1060 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1061 EPILOGUE_3_ARGS
1062.unchanged_dst:
1063 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1064 EPILOGUE_3_ARGS
1065ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1066
1067BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1068 PROLOGUE_3_ARGS
1069 %1 T0, A1
1070%if %4 != 0
1071 jz .unchanged_dst
1072%endif
1073 mov [A0], T0
1074.unchanged_dst:
1075 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1076 EPILOGUE_3_ARGS_EX 8
1077ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1078
1079 %endif ; RT_ARCH_AMD64
1080%endmacro
1081
; Instantiate the bit-scan workers. 4th param = 1 means the destination store
; is skipped when the scanned source is zero (ZF set) - bsf/bsr leave the
; destination unchanged then, while tzcnt/lzcnt always write a result.
IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1086
1087
1088;;
1089; Macro for implementing POPCNT.
1090;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
1096;
1097; ASSUMES Intel and AMD set EFLAGS the same way.
1098;
1099; ASSUMES the instruction does not support memory destination.
1100;
1101; @param 1 The instruction mnemonic.
1102; @param 2 The modified flags.
1103; @param 3 The undefined flags.
1104;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0      ; A2 = pEFlags; load guest flags if the build requires it.
        %1      T0_16, A1_16                    ; T0 = %1(source); the instruction sets the host flags.
        mov     [A0], T0_16                     ; Store the result into *pu16Dst.
        IEM_SAVE_FLAGS A2, %2, %3               ; Fold modified flags (%2) into *pEFlags; %3 stay undefined.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0_32, A1_32                    ; 32-bit variant of the same pattern.
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0, A1                          ; 64-bit variant; only available on 64-bit hosts.
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8                    ; 16 bytes of fastcall stack args to clean up on x86... (EX 8)
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
; POPCNT modifies all six status flags (all but CF/ZF are cleared by the CPU).
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1137
1138
1139;
1140; IMUL is also a similar but yet different case (no lock, no mem dst).
1141; The rDX:rAX variant of imul is handled together with mul further down.
1142;
1143BEGINCODE
1144; @param 1 EFLAGS that are modified.
1145; @param 2 Undefined EFLAGS.
1146; @param 3 Function suffix.
1147; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored),
1148; 2 for AMD (set AF, clear PF, ZF and SF).
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_16, word [A0]        ; A1 = A1 * *pu16Dst; sets OF/CF on overflow.
        mov     [A0], A1_16             ; Write the truncated product back to the destination.
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else                                  ; Intel: recompute SF/PF from the result, clear AF/ZF.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_32, dword [A0]       ; 32-bit two-operand imul; same pattern as u16.
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1, qword [A0]          ; 64-bit variant; 64-bit hosts only.
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
; Native flag behaviour, then the Intel and AMD EFLAGS personality variants.
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1194
1195
1196;
1197; XCHG for memory operands. This implies locking. No flag changes.
1198;
1199; Each function takes two arguments, first the pointer to the memory,
1200; then the pointer to the register. They all return void.
1201;
1202BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value (*pu8Reg).
        xchg    [A0], T0_8              ; Atomic swap - xchg with a memory operand is implicitly locked.
        mov     [A1], T0_8              ; Hand the old memory value back via *pu8Reg.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16             ; Implicitly locked, see u8 variant.
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32             ; Implicitly locked, see u8 variant.
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0                ; Implicitly locked, see u8 variant.
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1236
1237; Unlocked variants for fDisregardLock mode.
1238
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A0]              ; T0 = old memory value.
        mov     T1_8, [A1]              ; T1 = old register value.
        mov     [A0], T1_8              ; Memory receives the register value...
        mov     [A1], T0_8              ; ...and the register receives the old memory value.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A0]             ; Same non-atomic load/load/store/store swap as u8.
        mov     T1_16, [A1]
        mov     [A0], T1_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A0]             ; Same non-atomic swap, 32-bit width.
        mov     T1_32, [A1]
        mov     [A0], T1_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A0]                ; Same non-atomic swap, 64-bit width (64-bit hosts only).
        mov     T1, [A1]
        mov     [A0], T1
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1276
1277
1278;
1279; XADD for memory operands.
1280;
1281; Each function takes three arguments, first the pointer to the
1282; memory/register, then the pointer to the register, and finally a pointer to
1283; eflags. They all return void.
1284;
1285BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]              ; T0 = register operand (*pu8Reg).
        xadd    [A0], T0_8              ; *pu8Mem += T0; T0 = old *pu8Mem; sets all arithmetic flags.
        mov     [A1], T0_8              ; Return the old memory value in *pu8Reg.
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16             ; See u8 variant for the exchange-and-add semantics.
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32             ; See u8 variant.
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        xadd    [A0], T0                ; See u8 variant; 64-bit hosts only.
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1327
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]              ; T0 = register operand (*pu8Reg).
        lock xadd [A0], T0_8            ; Atomic exchange-and-add; sets all arithmetic flags.
        mov     [A1], T0_8              ; Return the old memory value in *pu8Reg.
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16           ; See u8 variant.
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32           ; See u8 variant.
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        lock xadd [A0], T0              ; See u8 variant; 64-bit hosts only.
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1369
1370
1371;
1372; CMPXCHG8B.
1373;
1374; These are tricky register wise, so the code is duplicated for each calling
1375; convention.
1376;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1378;
1379; C-proto:
1380; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1381; uint32_t *pEFlags));
1382;
1383; Note! Identical to iemAImpl_cmpxchg16b.
1384;
1385BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; cmpxchg8b needs EBX; rbx is callee-saved.

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; EBX:ECX = compare/exchange value from *pu64EbxEcx.
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; EDX:EAX = expected value from *pu64EaxEdx.
        mov     edx, [r11 + 4]

        cmpxchg8b [r10]                 ; ZF=1: stored EBX:ECX; ZF=0: EDX:EAX = current value.

        mov     [r11], eax              ; Write EDX:EAX back unconditionally (harmless on success).
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; cmpxchg8b needs EBX; rbx is callee-saved.

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; EBX:ECX = compare/exchange value.
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; EDX:EAX = expected value from *pu64EaxEdx.
        mov     edx, [rsi + 4]

        cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; 32-bit host: the instruction needs EAX, EBX, ECX, EDX, so preserve
        ; the callee-saved registers and use ESI/EDI/EBP for the pointers.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX:ECX = compare/exchange value (loading ECX last
        mov     ecx, [ecx + 4]          ; since it overwrites the pointer).
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EDX:EAX = expected value.
        mov     edx, [esi + 4]

        cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; Pop the two stack-passed arguments.
%endif
ENDPROC iemAImpl_cmpxchg8b
1460
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; Identical to iemAImpl_cmpxchg8b except for the LOCK prefix.
        push    rbx                     ; cmpxchg8b needs EBX; rbx is callee-saved.

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; EBX:ECX = compare/exchange value.
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; EDX:EAX = expected value.
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]            ; Atomic compare-and-exchange.

        mov     [r11], eax
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; EBX:ECX = compare/exchange value.
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; EDX:EAX = expected value.
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]            ; Atomic compare-and-exchange.

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; 32-bit host: see the unlocked variant for the register juggling.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]            ; Atomic compare-and-exchange.

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
1535
1536%ifdef RT_ARCH_AMD64
1537
1538;
1539; CMPXCHG16B.
1540;
1541; These are tricky register wise, so the code is duplicated for each calling
1542; convention.
1543;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1545;
1546; C-proto:
1547; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
1548; uint32_t *pEFlags));
1549;
1550; Note! Identical to iemAImpl_cmpxchg8b.
1551;
1552BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; cmpxchg16b needs RBX; rbx is callee-saved.

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; RBX:RCX = compare/exchange value from *pu128RbxRcx.
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; RDX:RAX = expected value from *pu128RaxRdx.
        mov     rdx, [r11 + 8]

        cmpxchg16b [r10]                ; ZF=1: stored RBX:RCX; ZF=0: RDX:RAX = current value.

        mov     [r11], rax              ; Write RDX:RAX back unconditionally (harmless on success).
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; RBX:RCX = compare/exchange value.
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; RDX:RAX = expected value.
        mov     rdx, [rsi + 8]

        cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1597
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        ; Identical to iemAImpl_cmpxchg16b except for the LOCK prefix.
        push    rbx                     ; cmpxchg16b needs RBX; rbx is callee-saved.

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; RBX:RCX = compare/exchange value.
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; RDX:RAX = expected value.
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]           ; Atomic 128-bit compare-and-exchange.

        mov     [r11], rax
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; RBX:RCX = compare/exchange value.
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; RDX:RAX = expected value.
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]           ; Atomic 128-bit compare-and-exchange.

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked
1642
1643%endif ; RT_ARCH_AMD64
1644
1645
1646;
1647; CMPXCHG.
1648;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1650;
1651; C-proto:
1652; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1653;
1654BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; AL = expected value (*puAl); cmpxchg uses the accumulator implicitly.
        %1 cmpxchg [A0], A2_8           ; %1 is empty or 'lock'.
        mov     [A1], al                ; Return the (possibly updated) accumulator value.
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; AX = expected value; see u8 variant.
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; EAX = expected value; see u8 variant.
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; RAX = expected value; see u8 variant.
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ; NOTE(review): this fallback always uses LOCK, even for the unlocked
        ; (%1 empty) instantiation - confirm this is intentional.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX:ECX = exchange value (loading ECX last).
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EDX:EAX = expected value.
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        jz      .cmpxchg8b_not_equal
;; @todo this isn't correct. Need to do a 64-bit compare, not just the lower 32-bit.
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

; Instantiate the plain and LOCK-prefixed variants.
IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1744
1745;;
1746; Macro for implementing a unary operator.
1747;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit accesses require hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
1754;
1755; @param 1 The instruction mnemonic.
1756; @param 2 The modified flags.
1757; @param 3 The undefined flags.
1758;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0      ; A1 = pEFlags for this two-argument family.
        %1      byte [A0]                       ; Apply %1 directly to the memory operand.
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 byte [A0]                       ; Atomic variant of the above.
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      qword [A0]                      ; 64-bit variants only on 64-bit hosts.
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; Note: inc/dec leave CF untouched, neg modifies it, not touches no flags.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1833
1834
1835;
1836; BSWAP. No flag changes.
1837;
1838; Each function takes one argument, pointer to the value to bswap
1839; (input/output). They all return void.
1840;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h                          ; Operand-size prefix: emits the 16-bit bswap encoding,
        bswap   T0_32                   ; whose result is architecturally undefined - this mimics
                                        ; real hardware. NOTE(review): result is CPU-dependent.
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16

BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32                   ; Reverse the byte order of the 32-bit value.
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32

BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0                      ; Native 64-bit byte reversal.
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: byte-swap each half and exchange the halves.
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; T0 = low dword.
        mov     T1, [A0 + 4]            ; T1 = high dword.
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; Swapped low dword becomes the new high dword...
        mov     [A0], T1                ; ...and vice versa.
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1876
1877
1878;;
1879; Macro for implementing a shift operation.
1880;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
1886;
1887; @param 1 The instruction mnemonic.
1888; @param 2 The modified flags.
1889; @param 3 The undefined flags.
1890; @param 4 Force load flags.
1891;
1892; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1893;
1894; @note the _intel and _amd variants are implemented in C.
1895;
%macro IEMIMPL_SHIFT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4     ; %4 may force-load flags (rcl/rcr consume CF).
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                        ; GCC convention: shift count into CL directly.
        %1      byte [A0], cl
 %else
        xchg    A1, A0                          ; MSC convention: A0 is rcx, so swap to get the
        %1      byte [A1], cl                   ; count into CL and the pointer into A1.
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0                          ; See u8 variant.
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0                          ; See u8 variant.
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
 %else
        xchg    A1, A0                          ; See u8 variant.
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

;; @todo some questions wrt flags when the shift count is high according to intel docs...
; Rotates force-load CF (rcl/rcr rotate through it); plain shifts do not.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
1966
1967
1968;;
1969; Macro for implementing a double precision shift operation.
1970;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
1973;
1974; The functions takes the destination operand (r/m) in A0, the source (reg) in
1975; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1976;
1977; @param 1 The instruction mnemonic.
1978; @param 2 The modified flags.
1979; @param 3 The undefined flags.
1980;
1981; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1982;
1983; @note the _intel and _amd variants are implemented in C.
1984;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; GCC: A3 is rcx; swap so the count lands in CL,
        %1      [A0], A1_16, cl         ; then swap back so A3 = pEFlags again for the save.
        xchg    A3, A2
 %else
        xchg    A0, A2                  ; MSC: A0 is rcx; swap so the count lands in CL
        %1      [A2], A1_16, cl         ; and A2 points at the destination.
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; See u16 variant.
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; See u16 variant.
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2038
2039
2040;;
; Macro for implementing multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
2045;
2046; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2047; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2048; pointer to eflags in A3.
2049;
2050; The functions all return 0 so the caller can be used for div/idiv as well as
2051; for the mul/imul implementation.
2052;
2053; @param 1 The instruction mnemonic.
2054; @param 2 The modified flags.
2055; @param 3 The undefined flags.
2056; @param 4 Name suffix.
2057; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2058;
2059; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2060;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]                ; AL = *puAX; mul/imul use the accumulator implicitly.
        %1      A1_8                    ; AX = AL * A1_8.
        mov     [A0], ax                ; Store the full 16-bit product back into *puAX.
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else                                  ; Intel: recompute SF/PF from the result, clear AF/ZF.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; Return 0 - shared convention with the div/idiv workers.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]                ; AX = *puAX.
 %ifdef ASM_CALL64_GCC
        %1      A2_16                   ; DX:AX = AX * A2_16.
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; A1 overlaps rdx in this convention; stash the
        %1      A2_16                   ; pointer in T1 before the instruction clobbers DX.
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax                ; Return 0 (see u8 variant).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]               ; EAX = *puAX.
 %ifdef ASM_CALL64_GCC
        %1      A2_32                   ; EDX:EAX = EAX * A2_32.
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; See u16 variant for why A1 is stashed.
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]               ; RAX = *puAX.
 %ifdef ASM_CALL64_GCC
        %1      A2                      ; RDX:RAX = RAX * A2.
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; See u16 variant for why A1 is stashed.
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; !RT_ARCH_AMD64

%endmacro

; Native flag behaviour, then the Intel and AMD EFLAGS personality variants.
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2157
2158
BEGINCODE
;;
; Worker that negates the 64-bit value held in the T1:T0 register pair
; (32-bit halves, T1 = high, T0 = low).
;
; Uses the classic two-word negation idiom (neg low sets CF iff the low half
; was non-zero; that borrow is then applied to the negated high half) instead
; of computing 0 - T1:T0 via stack temporaries.
;
; @uses None (T0,T1, flags)
BEGINPROC iemAImpl_negate_T0_T1_u32
        neg     T1_32                   ; high half negated, borrow not yet applied
        neg     T0_32                   ; low half negated; CF=1 iff T0 was non-zero
        sbb     T1_32, 0                ; propagate the borrow into the high half
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2173
%ifdef RT_ARCH_AMD64
;;
; Worker that negates the 128-bit value held in the T1:T0 register pair
; (64-bit halves, T1 = high, T0 = low).
;
; Same neg/neg/sbb two-word negation idiom as the 32-bit worker above,
; avoiding the stack temporaries.
;
; @uses None (T0,T1, flags)
BEGINPROC iemAImpl_negate_T0_T1_u64
        neg     T1                      ; high half negated, borrow not yet applied
        neg     T0                      ; low half negated; CF=1 iff T0 was non-zero
        sbb     T1, 0                   ; propagate the borrow into the high half
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2189
2190
2191;;
2192; Macro for implementing a division operations.
2193;
2194; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2195; 32-bit system where the 64-bit accesses requires hand coding.
2196;
2197; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2198; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2199; pointer to eflags in A3.
2200;
2201; The functions all return 0 on success and -1 if a divide error should be
2202; raised by the caller.
2203;
2204; @param 1 The instruction mnemonic.
2205; @param 2 The modified flags.
2206; @param 3 The undefined flags.
2207; @param 4 1 if signed, 0 if unsigned.
2208; @param 5 Function suffix.
2209; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2210; 2 for AMD (set AF, clear PF, ZF and SF).
2211;
2212; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2213;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
;
; The 8-bit worker: dividend in AX at [A0], divisor in A1, eflags ptr in A2.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        and     A1_32, 0xff             ; Ensure it's zero extended to 16-bits for the idiv range check.
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; AH (dividend high byte) must be below the divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A0]        ; T0 = dividend (zero extending to full register to simplify register aliasing)
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]                ; AX = dividend (implicit div/idiv operand).
        %1      A1_8
        mov     [A0], ax                ; AL = quotient, AH = remainder.
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = raise \#DE
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

;
; The 16-bit worker: rAX at [A0], rDX at [A1], divisor in A2, eflags ptr in A3.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        and     A2_16, 0xffff           ; Zero extend it for simpler sign overflow checks (see below).
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; The high word of the dividend must be below the divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A1]        ; Zero extend to simplify register aliasing by clobbering the whole register.
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Free up rdx (A2 on GCC) for the instruction.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax                ; quotient
        mov     [A1], dx                ; remainder
 %else
        mov     T1, A1                  ; Free up rdx (A1 on MSC) for the instruction.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax                ; quotient
        mov     [T1], dx                ; remainder
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = raise \#DE
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

;
; The 32-bit worker: rAX at [A0], rDX at [A1], divisor in A2, eflags ptr in A3.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; The high dword of the dividend must be below the divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        ;test   A2_32, A2_32 - we did this 5 instructions ago.
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        ; Note! eax/edx are loaded inside both branches below, so no load here.
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Free up rdx (A2 on GCC) for the instruction.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax               ; quotient
        mov     [A1], edx               ; remainder
 %else
        mov     T1, A1                  ; Free up rdx (A1 on MSC) for the instruction.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax               ; quotient
        mov     [T1], edx               ; remainder
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Drop the saved divisor before bailing out.
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 = raise \#DE
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;
; The 64-bit worker: rAX at [A0], rDX at [A1], divisor in A2, eflags ptr in A3.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2
        jz      .div_zero
  %if %4 == 0
        cmp     [A1], A2                ; The high qword of the dividend must be below the divisor.
        jae     .div_overflow
  %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        ;test   A2, A2 - we did this five instructions above.
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
  %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        ; Note! rax/rdx are loaded inside both branches below, so no load here.
  %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Free up rdx (A2 on GCC) for the instruction.
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax               ; quotient
        mov     [A1], rdx               ; remainder
  %else
        mov     T1, A1                  ; Free up rdx (A1 on MSC) for the instruction.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax               ; quotient
        mov     [T1], rdx               ; remainder
  %endif
  %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
  %else
        IEM_SAVE_FLAGS A3, %2, %3
  %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
  %if %4 != 0
        pop     A2                      ; Drop the saved divisor before bailing out.
  %endif
.div_zero:
        mov     eax, -1                 ; return -1 = raise \#DE
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; !RT_ARCH_AMD64

%endmacro
2539
; Instantiate the div/idiv workers: a native variant plus Intel- and AMD-style
; variants for the flags the architecture leaves undefined (param 6: 0=native,
; 1=Intel/ignored, 2=AMD/set AF+clear PF,ZF,SF).  Param 4: 1=signed.
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2547
2548
2549;;
2550; Macro for implementing memory fence operation.
2551;
2552; No return value, no operands or anything.
2553;
2554; @param 1 The instruction.
2555;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; the fence instruction itself (lfence/sfence/mfence)
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

; Instantiate the three SSE/SSE2 fence workers.
IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2567
;;
; Alternative for non-SSE2 host.
;
; An xchg with a memory operand has an implicit LOCK prefix, which serves as a
; full memory barrier on CPUs lacking the SSE2 fence instructions.  The
; push/xchg/add sequence leaves both xAX and the stack pointer unchanged.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; create a scratch slot holding the xAX value
        xchg    xAX, [xSP]              ; implicitly LOCKed access = full fence; reloads xAX with its own value
        add     xSP, xCB                ; drop the scratch slot
        ret
ENDPROC iemAImpl_alt_mem_fence
2577
2578
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; Uses the 32-bit register forms (T0_32/T1_32) like the sibling macro
; FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 below: movzx/and already zero
; the full register, and the 32-bit encodings are shorter.
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; dump the current FPU environment so we can patch it below

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; keep the current TOP, merge in the guest's condition codes
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; activate the merged environment
%endmacro
2604
2605
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word, status word, and update the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; dump the current FPU environment so we can patch it below

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; keep the current TOP, merge in the guest's condition codes
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK ; T1 = the guest TOP = bit number of ST0 in the abridged FTW
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]                   ; activate the merged environment
%endmacro
2643
2644
;;
; FPU result packet: the 80-bit register value plus the output status word.
; @todo Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result  resw 5                  ; The 80-bit (10 byte) floating point register value.
    .FSW        resw 1                  ; The output FPU status word.
endstruc
2652
2653
;;
; FPU result packet with two 80-bit register values plus the status word.
; @todo Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; The first 80-bit floating point register value.
    .FSW        resw 1                  ; The output FPU status word.
    .r80Result2 resw 5                  ; The second 80-bit floating point register value.
endstruc
2662
2663
2664;
2665;---------------------- 16-bit signed integer operations ----------------------
2666;
2667
2668
;;
; Converts a 16-bit signed integer value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (fnstenv/fldenv)

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and safe FSW bits
        fild    word [A2]               ; ST0 = converted 16-bit integer

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2692
2693
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = input value (loaded before the guest FCW is applied)
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]               ; convert+store using the guest's rounding/exception settings

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2717
2718
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]               ; truncating convert+store (ignores the RC field)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2743
2744
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; ST0 = ST0 <op> 16-bit integer

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

; Instantiate the 16-bit integer arithmetic workers.
IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2781
2782
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; e.g. ficom - result is in FSW only

        fnstsw  word [A1]               ; return the status word (condition codes)

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2813
2814
2815
2816;
2817;---------------------- 32-bit signed integer operations ----------------------
2818;
2819
2820
;;
; Converts a 32-bit signed integer value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]              ; ST0 = converted 32-bit integer

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2844
2845
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]              ; convert+store using the guest's rounding/exception settings

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2869
2870
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]              ; truncating convert+store (ignores the RC field)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2895
2896
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; ST0 = ST0 <op> 32-bit integer

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

; Instantiate the 32-bit integer arithmetic workers.
IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2933
2934
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; e.g. ficom - result is in FSW only

        fnstsw  word [A1]               ; return the status word (condition codes)

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2965
2966
2967
2968;
2969;---------------------- 64-bit signed integer operations ----------------------
2970;
2971
2972
;;
; Converts a 64-bit signed integer value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]              ; ST0 = converted 64-bit integer

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2996
2997
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]              ; convert+store using the guest's rounding/exception settings

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
3021
3022
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]              ; truncating convert+store (ignores the RC field)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
3047
3048
3049
3050;
3051;---------------------- 32-bit floating point operations ----------------------
3052;
3053
;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]              ; ST0 = converted single precision value

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
3077
3078
;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]              ; round+store using the guest's rounding/exception settings

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
3102
3103
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; ST0 = ST0 <op> 32-bit float

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

; Instantiate the r80-by-r32 arithmetic workers.
IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3140
3141
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; e.g. fcom - result is in FSW only

        fnstsw  word [A1]               ; return the status word (condition codes)

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3172
3173
3174
3175;
3176;---------------------- 64-bit floating point operations ----------------------
3177;
3178
;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]              ; ST0 = converted double precision value

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3202
3203
;;
; Store a 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]              ; round+store using the guest's rounding/exception settings

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3227
3228
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; ST0 = ST0 <op> 64-bit float

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

; Instantiate the r80-by-r64 arithmetic workers.
IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3265
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; e.g. fcom - result is in FSW only

        fnstsw  word [A1]               ; return the status word (condition codes)

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3296
3297
3298
3299;
3300;---------------------- 80-bit floating point operations ----------------------
3301;
3302
;;
; Loads a 80-bit floating point register value from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]              ; ST0 = the 80-bit value (no conversion)

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3326
3327
;;
; Store a 80-bit floating point register to memory
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]              ; store the full 80-bit value (no rounding)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3351
3352
;;
; Loads an 80-bit floating point register value in BCD format from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch buffer for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]              ; ST0 = converted packed BCD value

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3376
3377
3378;;
3379; Store a 80-bit floating point register to memory as BCD
3380;
3381; @param A0 FPU context (fxsave).
3382; @param A1 Where to return the output FSW.
3383; @param A2 Where to store the 80-bit BCD value.
3384; @param A3 Pointer to the 80-bit register value.
3385;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 20h bytes of local scratch space

        fninit                          ; clean FPU state
        fld     tword [A3]              ; st0 := register value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; install guest FCW and a safe FSW
        fbstp   tword [A2]              ; store as packed BCD under guest FCW

        fnstsw  word [A1]               ; return the resulting FSW

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3401
3402
3403;;
3404; FPU instruction working on two 80-bit floating point values.
3405;
3406; @param 1 The instruction
3407;
3408; @param A0 FPU context (fxsave).
3409; @param A1 Pointer to a IEMFPURESULT for the output.
3410; @param A2 Pointer to the first 80-bit value (ST0)
3411; @param A3 Pointer to the second 80-bit value (STn).
3412;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 20h bytes of local scratch space

        fninit                          ; clean FPU state
        fld     tword [A3]              ; st1 := STn operand
        fld     tword [A2]              ; st0 := ST0 operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; install guest FCW and a safe FSW
        %1      %2                      ; e.g. "fadd st0, st1" or bare "fprem"

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

; Instantiations; the braces are the operand list passed as macro parameter 2.
IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3443
3444
3445;;
3446; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3447; storing the result in ST1 and popping the stack.
3448;
3449; @param 1 The instruction
3450;
3451; @param A0 FPU context (fxsave).
3452; @param A1 Pointer to a IEMFPURESULT for the output.
3453; @param A2 Pointer to the first 80-bit value (ST1).
3454; @param A3 Pointer to the second 80-bit value (ST0).
3455;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 20h bytes of local scratch space

        fninit                          ; clean FPU state
        fld     tword [A2]              ; st1 := ST1 operand (loaded first, ends up below)
        fld     tword [A3]              ; st0 := ST0 operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; install guest FCW and a safe FSW
        %1                              ; instruction operates on st0/st1 and pops (result left in st0)

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3480
3481
3482;;
3483; FPU instruction working on two 80-bit floating point values, only
3484; returning FSW.
3485;
3486; @param 1 The instruction
3487;
3488; @param A0 FPU context (fxsave).
3489; @param A1 Pointer to a uint16_t for the resulting FSW.
3490; @param A2 Pointer to the first 80-bit value.
3491; @param A3 Pointer to the second 80-bit value.
3492;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 20h bytes of local scratch space

        fninit                          ; clean FPU state
        fld     tword [A3]              ; st1 := second operand
        fld     tword [A2]              ; st0 := first operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; install guest FCW and a safe FSW
        %1      st0, st1                ; compare; result is reflected in FSW condition codes only

        fnstsw  word [A1]               ; return the resulting FSW

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3514
3515
3516;;
3517; FPU instruction working on two 80-bit floating point values,
3518; returning FSW and EFLAGS (eax).
3519;
3520; @param 1 The instruction
3521;
3522; @returns EFLAGS in EAX.
3523; @param A0 FPU context (fxsave).
3524; @param A1 Pointer to a uint16_t for the resulting FSW.
3525; @param A2 Pointer to the first 80-bit value.
3526; @param A3 Pointer to the second 80-bit value.
3527;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 20h bytes of local scratch space

        fninit                          ; clean FPU state
        fld     tword [A3]              ; st1 := second operand
        fld     tword [A2]              ; st0 := first operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; install guest FCW and a safe FSW
        %1      st1                     ; fcomi/fucomi compare st0 against st1, setting EFLAGS

        fnstsw  word [A1]               ; return the resulting FSW
        pushf                           ; return the EFLAGS produced by the compare ...
        pop     xAX                     ; ... in xAX per the function contract

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3551
3552
3553;;
3554; FPU instruction working on one 80-bit floating point value.
3555;
3556; @param 1 The instruction
3557;
3558; @param A0 FPU context (fxsave).
3559; @param A1 Pointer to a IEMFPURESULT for the output.
3560; @param A2 Pointer to the 80-bit value.
3561;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; 20h bytes of local scratch space

        fninit                          ; clean FPU state
        fld     tword [A2]              ; st0 := input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; install guest FCW and a safe FSW
        %1                              ; unary operation on st0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3589
3590
3591;;
3592; FPU instruction working on one 80-bit floating point value, only
3593; returning FSW.
3594;
3595; @param 1 The instruction
3596; @param 2 Non-zero to also restore FTW.
3597;
3598; @param A0 FPU context (fxsave).
3599; @param A1 Pointer to a uint16_t for the resulting FSW.
3600; @param A2 Pointer to the 80-bit value.
3601;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; 20h bytes of local scratch space

        fninit                          ; clean FPU state
        fld     tword [A2]              ; st0 := input value
%if %2 != 0
        ; fxam inspects the tag word too, so also restore guest FTW (with st0 marked valid).
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1                              ; examine/test st0; result goes into FSW condition codes

        fnstsw  word [A1]               ; return the resulting FSW

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3626
3627
3628
3629;;
3630; FPU instruction loading a 80-bit floating point constant.
3631;
3632; @param 1 The instruction
3633;
3634; @param A0 FPU context (fxsave).
3635; @param A1 Pointer to a IEMFPURESULT for the output.
3636;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; 20h bytes of local scratch space

        fninit                          ; clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; install guest FCW (controls rounding of the constant)
        %1                              ; push the constant onto the FPU stack

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3663
3664
3665;;
; FPU instruction working on one 80-bit floating point value, outputting two.
3667;
3668; @param 1 The instruction
3669;
3670; @param A0 FPU context (fxsave).
3671; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3672; @param A2 Pointer to the 80-bit value.
3673;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; 20h bytes of local scratch space

        fninit                          ; clean FPU state
        fld     tword [A2]              ; st0 := input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; install guest FCW and a safe FSW
        %1                              ; pushes a second result, so st0/st1 both hold outputs

        fnstsw  word [A1 + IEMFPURESULTTWO.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions before each store
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; top of stack is the second result
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3699
3700
3701
3702
3703;---------------------- SSE and MMX Operations ----------------------
3704
;; @todo what do we need to do for MMX?
; NOTE: These prologue/epilogue macros are currently empty placeholders; they exist
;       so per-technology save/restore code can be added in one place later.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3722
3723
3724;;
3725; Media instruction working on two full sized registers.
3726;
3727; @param 1 The instruction
3728; @param 2 Whether there is an MMX variant (1) or not (0).
3729;
3730; @param A0 FPU context (fxsave).
3731; @param A1 Pointer to the first media register size operand (input/output).
3732; @param A2 Pointer to the second media register size operand (input).
3733;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
; 64-bit MMX variant (emitted only when parameter 2 is non-zero).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; first operand (also the destination)
        movq    mm1, [A2]               ; second operand
        %1      mm0, mm1
        movq    [A1], mm0               ; write result back to the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit SSE variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; first operand (also the destination)
        movdqu  xmm1, [A2]              ; second operand
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; write result back to the first operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb, 1
IEMIMPL_MEDIA_F2 pand, 1
IEMIMPL_MEDIA_F2 pandn, 1
IEMIMPL_MEDIA_F2 por, 1
IEMIMPL_MEDIA_F2 pxor, 1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb, 1
IEMIMPL_MEDIA_F2 paddw, 1
IEMIMPL_MEDIA_F2 paddd, 1
IEMIMPL_MEDIA_F2 paddq, 1
IEMIMPL_MEDIA_F2 paddsb, 1
IEMIMPL_MEDIA_F2 paddsw, 1
IEMIMPL_MEDIA_F2 paddusb, 1
IEMIMPL_MEDIA_F2 paddusw, 1
IEMIMPL_MEDIA_F2 psubb, 1
IEMIMPL_MEDIA_F2 psubw, 1
IEMIMPL_MEDIA_F2 psubd, 1
IEMIMPL_MEDIA_F2 psubq, 1
IEMIMPL_MEDIA_F2 psubsb, 1
IEMIMPL_MEDIA_F2 psubsw, 1
IEMIMPL_MEDIA_F2 psubusb, 1
IEMIMPL_MEDIA_F2 psubusw, 1
IEMIMPL_MEDIA_F2 pmullw, 1
IEMIMPL_MEDIA_F2 pmulld, 0
IEMIMPL_MEDIA_F2 pmulhw, 1
IEMIMPL_MEDIA_F2 pmaddwd, 1
IEMIMPL_MEDIA_F2 pminub, 1
IEMIMPL_MEDIA_F2 pminuw, 0
IEMIMPL_MEDIA_F2 pminud, 0
IEMIMPL_MEDIA_F2 pminsb, 0
IEMIMPL_MEDIA_F2 pminsw, 1
IEMIMPL_MEDIA_F2 pminsd, 0
IEMIMPL_MEDIA_F2 pmaxub, 1
IEMIMPL_MEDIA_F2 pmaxuw, 0
IEMIMPL_MEDIA_F2 pmaxud, 0
IEMIMPL_MEDIA_F2 pmaxsb, 0
IEMIMPL_MEDIA_F2 pmaxsw, 1
IEMIMPL_MEDIA_F2 pmaxsd, 0
IEMIMPL_MEDIA_F2 pabsb, 1
IEMIMPL_MEDIA_F2 pabsw, 1
IEMIMPL_MEDIA_F2 pabsd, 1
IEMIMPL_MEDIA_F2 psignb, 1
IEMIMPL_MEDIA_F2 psignw, 1
IEMIMPL_MEDIA_F2 psignd, 1
IEMIMPL_MEDIA_F2 phaddw, 1
IEMIMPL_MEDIA_F2 phaddd, 1
IEMIMPL_MEDIA_F2 phsubw, 1
IEMIMPL_MEDIA_F2 phsubd, 1
IEMIMPL_MEDIA_F2 phaddsw, 1
IEMIMPL_MEDIA_F2 phsubsw, 1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw, 1
IEMIMPL_MEDIA_F2 pmuludq, 1
3824
3825
3826;;
3827; Media instruction working on two full sized registers, but no FXSAVE state argument.
3828;
3829; @param 1 The instruction
3830; @param 2 Whether there is an MMX variant (1) or not (0).
3831;
3832; @param A0 Pointer to the first media register size operand (input/output).
3833; @param A1 Pointer to the second media register size operand (input).
3834;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
; 64-bit MMX variant (emitted only when parameter 2 is non-zero).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; first operand (also the destination)
        movq    mm1, [A1]               ; second operand
        %1      mm0, mm1
        movq    [A0], mm0               ; write result back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit SSE variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; first operand (also the destination)
        movdqu  xmm1, [A1]              ; second operand
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3897
3898;;
3899; Media instruction working on one full sized and one half sized register (lower half).
3900;
3901; @param 1 The instruction
3902; @param 2 1 if MMX is included, 0 if not.
3903;
3904; @param A0 Pointer to the first full sized media register operand (input/output).
3905; @param A1 Pointer to the second half sized media register operand (input).
3906;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
; 64-bit MMX variant (emitted only when parameter 2 is non-zero).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; full sized operand (also the destination)
        movq    mm1, [A1]               ; half sized source (loaded in full)
        %1      mm0, mm1
        movq    [A0], mm0               ; write result back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; 128-bit SSE variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; full sized operand (also the destination)
        movdqu  xmm1, [A1]              ; half sized source (loaded in full)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3941
3942
3943;;
3944; Media instruction working two half sized input registers (lower half) and a full sized
3945; destination register (vpunpckh*).
3946;
3947; @param 1 The instruction
3948;
3949; @param A0 Pointer to the destination register (full sized, output only).
3950; @param A1 Pointer to the first full sized media source register operand, where we
3951; will only use the lower half as input - but we'll be loading it in full.
3952; @param A2 Pointer to the second full sized media source register operand, where we
3953; will only use the lower half as input - but we'll be loading it in full.
3954;
%macro IEMIMPL_MEDIA_F1L1L1 1
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source (only the low half is used by vpunpckl*)
        vmovdqu xmm1, [A2]              ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; full sized destination

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; first source
        vmovdqu ymm1, [A2]              ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0              ; full sized destination

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3987
3988
3989;;
3990; Media instruction working on one full sized and one half sized register (high half).
3991;
3992; @param 1 The instruction
3993; @param 2 1 if MMX is included, 0 if not.
3994;
3995; @param A0 Pointer to the first full sized media register operand (input/output).
3996; @param A1 Pointer to the second full sized media register operand, where we
3997; will only use the upper half as input - but we'll load it in full.
3998;
; The high-half unpack helpers have the same register/memory flow as the
; low-half ones, so this simply forwards to IEMIMPL_MEDIA_F1L1.
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

IEMIMPL_MEDIA_F1L1 punpckhbw, 1
IEMIMPL_MEDIA_F1L1 punpckhwd, 1
IEMIMPL_MEDIA_F1L1 punpckhdq, 1
IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
4007
4008
4009;;
4010; Media instruction working two half sized input registers (high half) and a full sized
4011; destination register (vpunpckh*).
4012;
4013; @param 1 The instruction
4014;
4015; @param A0 Pointer to the destination register (full sized, output only).
4016; @param A1 Pointer to the first full sized media source register operand, where we
4017; will only use the upper half as input - but we'll be loading it in full.
4018; @param A2 Pointer to the second full sized media source register operand, where we
4019; will only use the upper half as input - but we'll be loading it in full.
4020;
; Same flow as the low-half AVX unpack helper, so forward to IEMIMPL_MEDIA_F1L1L1.
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4029
4030
4031;
4032; Shufflers with evil 8-bit immediates.
4033;
4034
; The immediate byte cannot be patched at runtime, so a 256-entry table of
; "pshufw mm0, mm1, imm8 ; ret" stubs is generated below and indexed by the
; immediate value (each stub is 5 bytes, or 9 with an endbr prefix).
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm1, [A1]
        movq    mm0, mm0                ; paranoia! (NOTE(review): a no-op; mm0 is fully written by the stub anyway)
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(pshufw+ret) == 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = &stub[imm8]
        IBT_NOTRACK
        call    T1                      ; execute the stub for this immediate
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; assert the stub stride assumed above
ENDPROC iemAImpl_pshufw_u64
4065
4066
; SSE pshufhw/pshuflw/pshufd via a 256-entry stub table indexed by the
; immediate byte (each stub is 6 bytes, or 10 with an endbr prefix).
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia! (pshufhw/pshuflw only modify half the destination)
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &stub[imm8]
        IBT_NOTRACK
        call    T1                      ; execute the stub for this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the stub stride assumed above
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4104
4105
; AVX vpshufhw/vpshuflw/vpshufd via a 256-entry stub table indexed by the
; immediate byte (each stub is 6 bytes, or 10 with an endbr prefix).
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (this macro emits AVX code; both are empty today)

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia! (vpshufhw/vpshuflw only modify half of each lane)
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &stub[imm8]
        IBT_NOTRACK
        call    T1                      ; execute the stub for this immediate
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_SSE_EPILOGUE (consistency; both are empty today)
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the stub stride assumed above
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4142
4143
4144;
4145; Shifts with evil 8-bit immediates.
4146;
4147
; MMX shift-by-immediate via a 256-entry stub table indexed by the immediate
; byte (each stub is 5 bytes, or 9 with an endbr prefix).
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movq    mm0, [A0]               ; operand is both source and destination
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*8]         ; sizeof(psXX+ret) == 9
 %else
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = &stub[imm8]
        IBT_NOTRACK
        call    T1                      ; execute the stub for this immediate
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; assert the stub stride assumed above
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4188
4189
; SSE shift-by-immediate via a 256-entry stub table indexed by the immediate
; byte (each stub is 6 bytes, or 10 with an endbr prefix).
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movdqu  xmm0, [A0]              ; operand is both source and destination
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
 %else
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &stub[imm8]
        IBT_NOTRACK
        call    T1                      ; execute the stub for this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the stub stride assumed above
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4232
4233
4234;
4235; Move byte mask.
4236;
4237
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; source operand
        pmovmskb T0, mm1                ; gather the sign bits of each byte
        mov     [A0], T0                ; store the mask (register sized store)
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper half of the 64-bit destination manually
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4251
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; source operand
        pmovmskb T0, xmm1               ; gather the sign bits of each byte
        mov     [A0], T0                ; store the mask (register sized store)
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper half of the 64-bit destination manually
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4265
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]              ; source operand
        vpmovmskb T0, ymm1              ; gather the sign bits of each byte
        mov     [A0], T0                ; store the mask (register sized store)
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper half of the 64-bit destination manually
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4279
4280
4281;;
4282; Media instruction working on two full sized source registers and one destination (AVX).
4283;
4284; @param 1 The instruction
4285;
4286; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4287; @param A1 Pointer to the destination media register size operand (output).
4288; @param A2 Pointer to the first source media register size operand (input).
4289; @param A3 Pointer to the second source media register size operand (input).
4290;
%macro IEMIMPL_MEDIA_F3 1
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]              ; first source operand
        vmovdqu xmm1, [A3]              ; second source operand
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0              ; destination operand

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]              ; first source operand
        vmovdqu ymm1, [A3]              ; second source operand
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0              ; destination operand

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4352
4353
4354;;
4355; Media instruction working on two full sized source registers and one destination (AVX),
4356; but no XSAVE state pointer argument.
4357;
4358; @param 1 The instruction
4359;
4360; @param A0 Pointer to the destination media register size operand (output).
4361; @param A1 Pointer to the first source media register size operand (input).
4362; @param A2 Pointer to the second source media register size operand (input).
4363;
%macro IEMIMPL_MEDIA_OPT_F3 1
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source operand
        vmovdqu xmm1, [A2]              ; second source operand
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; destination operand

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; first source operand
        vmovdqu ymm1, [A2]              ; second source operand
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0              ; destination operand

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
IEMIMPL_MEDIA_OPT_F3 vpsubsb
IEMIMPL_MEDIA_OPT_F3 vpsubsw
IEMIMPL_MEDIA_OPT_F3 vpsubusb
IEMIMPL_MEDIA_OPT_F3 vpsubusw
IEMIMPL_MEDIA_OPT_F3 vpaddusb
IEMIMPL_MEDIA_OPT_F3 vpaddusw
IEMIMPL_MEDIA_OPT_F3 vpaddsb
IEMIMPL_MEDIA_OPT_F3 vpaddsw
4428
4429;;
4430; Media instruction working on one full sized source register, one full sized destination
4431; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
4432; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
4433; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
4434; of either 16, 32, or 64, it acts like the max shift size)
4435;
4436; @param 1 The instruction
4437;
4438; @param A0 Pointer to the destination media register size operand (output).
4439; @param A1 Pointer to the first source media register size operand (input).
4440; @param A2 Pointer to the second source media register size operand (input).
4441;
%macro IEMIMPL_SHIFT_OPT_F3 1
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; value to shift
        vmovdqu xmm1, [A2]              ; 128-bit load; the instruction uses the low 64 bits as shift count
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; destination operand

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant; the shift count operand stays XMM sized.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; value to shift
        vmovdqu xmm1, [A2]              ; shift count is always an XMM operand
        %1      ymm0, ymm0, xmm1
        vmovdqu [A0], ymm0              ; destination operand

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_SHIFT_OPT_F3 vpsllw
IEMIMPL_SHIFT_OPT_F3 vpslld
IEMIMPL_SHIFT_OPT_F3 vpsllq
4473
4474
4475;;
4476; Media instruction working on one full sized source registers and one destination (AVX),
4477; but no XSAVE state pointer argument.
4478;
4479; @param 1 The instruction
; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
4481;
4482; @param A0 Pointer to the destination media register size operand (output).
4483; @param A1 Pointer to the source media register size operand (input).
4484;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; source operand
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0              ; destination operand

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
; 256-bit variant (only for instructions with an AVX2 form).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; source operand
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0              ; destination operand

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4517
4518
4519;
4520; The SSE 4.2 crc32
4521;
; @param A0 Pointer to the 32-bit destination.
; @param A1 The source operand, sized according to the suffix.
4524;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; running CRC value
        crc32   T0_32, A1_8             ; fold in one source byte
        mov     [A0], T0_32             ; store updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4534
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; running CRC value
        crc32   T0_32, A1_16            ; fold in one source word
        mov     [A0], T0_32             ; store updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4544
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; running CRC value
        crc32   T0_32, A1_32            ; fold in one source dword
        mov     [A0], T0_32             ; store updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4554
%ifdef RT_ARCH_AMD64 ; 64-bit source form only exists on AMD64 hosts
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; running CRC value (32-bit load zero-extends into T0)
        crc32   T0, A1                  ; fold in one source qword
        mov     [A0], T0_32             ; store updated CRC (result fits in 32 bits)

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4566
4567
4568;
4569; PTEST (SSE 4.1)
4570;
4571; @param A0 Pointer to the first source operand (aka readonly destination).
4572; @param A1 Pointer to the second source operand.
4573; @param A2 Pointer to the EFLAGS register.
4574;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; first source operand (read-only destination)
        movdqu  xmm1, [A1]              ; second source operand
        ptest   xmm0, xmm1              ; sets ZF/CF only; no register result
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; merge status flags into *A2

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4587
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE            ; NOTE(review): SSE prologue used for an AVX insn - confirm this is intentional

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1              ; only sets flags, registers are unmodified
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; write resulting status flags to *A2

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4600
4601
4602;;
4603; Template for the [v]pmov{s,z}x* instructions
4604;
4605; @param 1 The instruction
4606;
4607; @param A0 Pointer to the destination media register size operand (output).
4608; @param A1 The source operand value (input).
4609;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1                ; A1 is the source value itself for the 128-bit forms
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_SSE_EPILOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (copy&paste typo), leaving prologue/epilogue unbalanced
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]              ; 256-bit form: A1 is a pointer to the 128-bit source
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro
4647
; Sign-extending packed moves.
IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

; Zero-extending packed moves.
IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4661
4662
4663;;
4664; Need to move this as well somewhere better?
4665;
struc IEMSSERESULT
    .uResult     resd 4                 ; 128-bit result value.
    .MXCSR       resd 1                 ; MXCSR with operation status flags merged in.
endstruc
4670
4671
4672;;
4673; Need to move this as well somewhere better?
4674;
struc IEMAVX128RESULT
    .uResult     resd 4                 ; 128-bit result value.
    .MXCSR       resd 1                 ; MXCSR after the operation.
endstruc
4679
4680
4681;;
4682; Need to move this as well somewhere better?
4683;
struc IEMAVX256RESULT
    .uResult     resd 8                 ; 256-bit result value.
    .MXCSR       resd 1                 ; MXCSR after the operation.
endstruc
4688
4689
4690;;
4691; Initialize the SSE MXCSR register using the guest value partially to
4692; account for rounding mode.
4693;
4694; @uses 4 bytes of stack to save the original value, T0.
4695; @param 1 Expression giving the address of the FXSTATE of the guest.
4696;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR (left on the stack for SSE_ST_FXSTATE_MXCSR)
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only FZ/RC/DAZ from the guest
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions so the host never traps
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the merged guest/host MXCSR
        add     xSP, 4                  ; note: the 4 bytes holding the host MXCSR remain allocated
%endmacro
4709
4710
4711;;
4712; Restores the SSE MXCSR register with the original value.
4713;
4714; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4715; @param 1 Expression giving the address where to return the MXCSR value.
4716; @param 2 Expression giving the address of the FXSTATE of the guest.
4717;
4718; @note Restores the stack pointer.
4719;
%macro SSE_ST_FXSTATE_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; fetch the MXCSR as left by the operation
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by SSE_LD_FXSTATE_MXCSR
        add     xSP, 4
%endmacro
4734
4735
4736;;
4737; Initialize the SSE MXCSR register using the guest value partially to
4738; account for rounding mode.
4739;
4740; @uses 4 bytes of stack to save the original value.
4741; @param 1 Expression giving the address of the FXSTATE of the guest.
4742;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR (restored by AVX*_ST_XSAVEAREA_MXCSR)
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only FZ/RC/DAZ from the guest
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the partial guest MXCSR
        add     xSP, 4                  ; note: the 4 bytes holding the host MXCSR remain allocated
%endmacro
4754
4755
4756;;
4757; Restores the AVX128 MXCSR register with the original value.
4758;
4759; @param 1 Expression giving the address where to return the MXCSR value.
4760;
4761; @note Restores the stack pointer.
4762;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR] ; store MXCSR as left by the operation

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
        add     xSP, 4
%endmacro
4769
4770
4771;;
4772; Restores the AVX256 MXCSR register with the original value.
4773;
4774; @param 1 Expression giving the address where to return the MXCSR value.
4775;
4776; @note Restores the stack pointer.
4777;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR] ; store MXCSR as left by the operation

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
        add     xSP, 4
%endmacro
4784
4785
4786;;
4787; Floating point instruction working on two full sized registers.
4788;
4789; @param 1 The instruction
4790; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4791;
4792; @param A0 FPU context (FXSTATE or XSAVEAREA).
4793; @param A1 Where to return the result including the MXCSR value.
4794; @param A2 Pointer to the first media register size operand (input/output).
4795; @param A3 Pointer to the second media register size operand (input).
4796;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (copy&paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1              ; two-operand AVX form (unary-style insns)
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro
4875
; Binary packed FP operations; 3 = three-operand AVX variants also emitted.
IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3
4894
4895
4896;;
4897; These are actually unary operations but to keep it simple
4898; we treat them as binary for now, so the output result is
4899; always in sync with the register where the result might get written
4900; to.
; Unary operations instantiated with 2 = two-operand AVX variants (or 0 = none).
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 rcpps, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4911
4912
4913;;
4914; Floating point instruction working on a full sized register and a single precision operand.
4915;
4916; @param 1 The instruction
4917;
4918; @param A0 FPU context (FXSTATE or XSAVEAREA).
4919; @param A1 Where to return the result including the MXCSR value.
4920; @param A2 Pointer to the first media register size operand (input/output).
4921; @param A3 Pointer to the second single precision floating point value (input).
4922;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]              ; 32-bit scalar source
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy&paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro
4954
; Scalar single-precision operations.
IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
4965
4966
4967;;
4968; Floating point instruction working on a full sized register and a double precision operand.
4969;
4970; @param 1 The instruction
4971;
4972; @param A0 FPU context (FXSTATE or XSAVEAREA).
4973; @param A1 Where to return the result including the MXCSR value.
4974; @param A2 Pointer to the first media register size operand (input/output).
4975; @param A3 Pointer to the second double precision floating point value (input).
4976;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movq    xmm1, [A3]              ; 64-bit scalar source
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovq   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro
5008
; Scalar double-precision operations.
IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
5017
5018
5019;;
5020; Macro for the cvtpd2ps/cvtps2pd instructions.
5021;
5022; 1 The instruction name.
5023; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
5024;
5025; @param A0 FPU context (FXSTATE or XSAVEAREA).
5026; @param A1 Where to return the result including the MXCSR value.
5027; @param A2 Pointer to the first media register size operand (input/output).
5028; @param A3 Pointer to the second media register size operand (input).
5029;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; narrowing: 256-bit source -> 128-bit result
 %else
        v %+ %1 ymm0, xmm1              ; widening: 128-bit source -> 256-bit result
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro
5080
IEMIMPL_CVT_F2 cvtpd2ps, 0              ; narrowing conversion (see macro param 2)
IEMIMPL_CVT_F2 cvtps2pd, 1              ; widening conversion
5083
5084
5085;;
5086; shufps instructions with 8-bit immediates.
5087;
5088; @param A0 Pointer to the destination media register size operand (input/output).
5089; @param A1 Pointer to the first source media register size operand (input).
5090; @param A2 The 8-bit immediate
5091;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps  xmm0, xmm1, bImm
        ret
        int3                            ; padding so each stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_shufps_u128
5123
5124
5125;;
5126; shufpd instruction with 8-bit immediates.
5127;
5128; @param A0 Pointer to the destination media register size operand (input/output).
5129; @param A1 Pointer to the first source media register size operand (input).
5130; @param A2 The 8-bit immediate
5131;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd  xmm0, xmm1, bImm        ; shufpd is one byte longer than shufps, so no int3 pad
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_shufpd_u128
5162
5163
5164;;
5165; vshufp{s,d} instructions with 8-bit immediates.
5166;
5167; @param 1 The instruction name.
5168;
5169; @param A0 Pointer to the destination media register size operand (output).
5170; @param A1 Pointer to the first source media register size operand (input).
5171; @param A2 Pointer to the second source media register size operand (input).
5172; @param A3 The 8-bit immediate
5173;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
5237
IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5240
5241
5242;;
5243; One of the [p]blendv{b,ps,pd} variants
5244;
5245; @param 1 The instruction
5246;
5247; @param A0 Pointer to the first media register sized operand (input/output).
5248; @param A1 Pointer to the second media sized value (input).
5249; @param A2 Pointer to the media register sized mask value (input).
5250;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2] ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1] ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (copy&paste typo)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
5266
IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5270
5271
5272;;
5273; One of the v[p]blendv{b,ps,pd} variants
5274;
5275; @param 1 The instruction
5276;
5277; @param A0 Pointer to the first media register sized operand (output).
5278; @param A1 Pointer to the first media register sized operand (input).
5279; @param A2 Pointer to the second media register sized operand (input).
5280; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]              ; the mask operand
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy&paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]              ; the mask operand
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy&paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
5310
IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5314
5315
5316;;
5317; palignr mm1, mm2/m64 instruction.
5318;
5319; @param A0 Pointer to the first media register sized operand (output).
5320; @param A1 The second register sized operand (input).
5321; @param A2 The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm0, [A0]
        movq    mm1, A1                 ; A1 carries the value itself, not a pointer
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_palignr_u64
5352
5353
5354;;
5355; SSE instructions with 8-bit immediates of the form
5356; xxx xmm1, xmm2, imm8.
5357; where the instruction encoding takes up 6 bytes.
5358;
5359; @param 1 The instruction name.
5360;
5361; @param A0 Pointer to the first media register size operand (input/output).
5362; @param A1 Pointer to the second source media register size operand (input).
5363; @param A2 The 8-bit immediate
5364;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; padding so each stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
5398
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5406
5407
5408;;
5409; AVX instructions with 8-bit immediates of the form
5410; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5411; where the instruction encoding takes up 6 bytes.
5412;
5413; @param 1 The instruction name.
5414; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5415; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5416;
5417; @param A0 Pointer to the destination media register size operand (output).
5418; @param A1 Pointer to the first source media register size operand (input).
5419; @param A2 Pointer to the second source media register size operand (input).
5420; @param A3 The 8-bit immediate
5421;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3                            ; padding so each stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3                            ; padding so each stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro
5491
; Params 2/3: emit 128-bit variant, emit 256-bit variant.
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1  ; 256-bit only by definition
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1  ; 256-bit only by definition
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
5500
5501
5502;;
5503; Need to move this as well somewhere better?
5504;
struc IEMPCMPISTRXSRC
    .uSrc1       resd 4                 ; first 128-bit source operand.
    .uSrc2       resd 4                 ; second 128-bit source operand.
endstruc
5509
struc IEMPCMPESTRXSRC
    .uSrc1       resd 4                 ; first 128-bit source operand.
    .uSrc2       resd 4                 ; second 128-bit source operand.
    .u64Rax      resd 2                 ; explicit RAX length input (64-bit).
    .u64Rdx      resd 2                 ; explicit RDX length input (64-bit).
endstruc
5516
5517;;
5518; The pcmpistri instruction.
5519;
5520; @param A0 Pointer to the ECX register to store the result to (output).
5521; @param A1 Pointer to the EFLAGS register.
5522; @param A2 Pointer to the structure containing the source operands (input).
5523; @param A3 The 8-bit immediate
5524;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; pcmpistri returns its index in ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistri xmm0, xmm1, bImm
        ret
        int3                            ; padding so each stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpistri_u128
5559
5560;;
5561; The pcmpestri instruction.
5562;
5563; @param A0 Pointer to the ECX register to store the result to (output).
5564; @param A1 Pointer to the EFLAGS register.
5565; @param A2 Pointer to the structure containing the source operands (input).
5566; @param A3 The 8-bit immediate
5567;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; pcmpestri returns its index in ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db 0x48                         ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestri_u128
5606
5607;;
5608; The pcmpistrm instruction template.
5609;
5610; @param A0 Pointer to the XMM0 register to store the result to (output).
5611; @param A1 Pointer to the EFLAGS register.
5612; @param A2 Pointer to the structure containing the source operands (input).
5613; @param A3 The 8-bit immediate
5614;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(pcmpistrm+ret) == 8: A3 * 8
                                        ; fixed: was 'lea T0, [T1 + A3*8]' - the scaled index was written
                                        ; to T0 and never used, so the call below always hit stub .imm0
                                        ; (imm8 == 0) in non-IBT builds.  Matches pcmpistri above.
 %endif
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0              ; pcmpistrm returns its mask in xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3                            ; padding so each stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpistrm_u128
5648
5649;;
5650; The pcmpestrm instruction template.
5651;
5652; @param A0 Pointer to the XMM0 register to store the result to (output).
5653; @param A1 Pointer to the EFLAGS register.
5654; @param A2 Pointer to the structure containing the source operands (input).
5655; @param A3 The 8-bit immediate
5656;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry imm8 stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub with the right hardcoded imm8

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0              ; pcmpestrm returns its mask in xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db 0x48                         ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestrm_u128
5694
5695
5696;;
5697; pinsrw instruction.
5698;
5699; @param A0 Pointer to the first media register size operand (input/output).
5700; @param A1 The 16 bit input operand (input).
5701; @param A2 The 8-bit immediate
5702;
5703BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
5704 PROLOGUE_3_ARGS
5705 IEMIMPL_SSE_PROLOGUE
5706
5707 movzx A2, A2_8 ; must clear top bits
5708 movq mm0, [A0]
5709 lea T1, [.imm0 xWrtRIP]
5710 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5711 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
5712 %else
5713 lea T0, [A2 + A2*4] ; sizeof(pinsrw+ret) == 5: A2 * 5
5714 %endif
5715 lea T1, [T1 + T0]
5716 IBT_NOTRACK
5717 call T1
5718 movq [A0], mm0
5719
5720 IEMIMPL_SSE_EPILOGUE
5721 EPILOGUE_3_ARGS
5722 %assign bImm 0
5723 %rep 256
5724.imm %+ bImm:
5725 IBT_ENDBRxx_WITHOUT_NOTRACK
5726 pinsrw mm0, A1_32, bImm
5727 ret
5728 %assign bImm bImm + 1
5729 %endrep
5730.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5731ENDPROC iemAImpl_pinsrw_u64
5732
;;
; pinsrw, SSE variant - same jump-table scheme as the MMX variant above, but
; the 66h-prefixed encoding makes each stub 6 bytes (10 with IBT endbrxx).
;
; @param A0      Pointer to the first media register size operand (input/output).
; @param A1      The 16 bit input operand (input).
; @param A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time check: 256 stubs * 6 bytes.
ENDPROC iemAImpl_pinsrw_u128
5762
5763;;
5764; vpinsrw instruction.
5765;
5766; @param A0 Pointer to the first media register size operand (output).
5767; @param A1 Pointer to the source media register size operand (input).
5768; @param A2 The 16 bit input operand (input).
5769; @param A3 The 8-bit immediate
5770;
5771BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
5772 PROLOGUE_4_ARGS
5773 IEMIMPL_SSE_PROLOGUE
5774
5775 movzx A3, A3_8 ; must clear top bits
5776 movdqu xmm0, [A1]
5777 lea T1, [.imm0 xWrtRIP]
5778 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5779 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
5780 %else
5781 lea T0, [A3 + A3*2] ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
5782 %endif
5783 lea T1, [T1 + T0*2]
5784 mov A1, A2 ; A2 requires longer encoding on Windows
5785 IBT_NOTRACK
5786 call T1
5787 movdqu [A0], xmm0
5788
5789 IEMIMPL_SSE_EPILOGUE
5790 EPILOGUE_4_ARGS
5791 %assign bImm 0
5792 %rep 256
5793.imm %+ bImm:
5794 IBT_ENDBRxx_WITHOUT_NOTRACK
5795 vpinsrw xmm0, xmm0, A1_32, bImm
5796 ret
5797 %assign bImm bImm + 1
5798 %endrep
5799.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5800ENDPROC iemAImpl_vpinsrw_u128
5801
5802
5803;;
5804; pextrw instruction.
5805;
5806; @param A0 Pointer to the 16bit output operand (output).
5807; @param A1 Pointer to the media register size operand (input).
5808; @param A2 The 8-bit immediate
5809;
5810BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
5811 PROLOGUE_3_ARGS
5812 IEMIMPL_SSE_PROLOGUE
5813
5814 movzx A2, A2_8 ; must clear top bits
5815 movq mm0, A1
5816 lea T1, [.imm0 xWrtRIP]
5817 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5818 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
5819 %else
5820 lea T0, [A2 + A2*4] ; sizeof(pextrw+ret) == 5: A2 * 5
5821 %endif
5822 lea T1, [T1 + T0]
5823 IBT_NOTRACK
5824 call T1
5825 mov word [A0], T0_16
5826
5827 IEMIMPL_SSE_EPILOGUE
5828 EPILOGUE_3_ARGS
5829 %assign bImm 0
5830 %rep 256
5831.imm %+ bImm:
5832 IBT_ENDBRxx_WITHOUT_NOTRACK
5833 pextrw T0_32, mm0, bImm
5834 ret
5835 %assign bImm bImm + 1
5836 %endrep
5837.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5838ENDPROC iemAImpl_pextrw_u64
5839
;;
; pextrw, SSE variant - same scheme as the MMX variant above, but the source
; is read through a pointer and each stub is 6 bytes (10 with IBT endbrxx).
;
; @param A0      Pointer to the 16bit output operand (output).
; @param A1      Pointer to the media register size operand (input).
; @param A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        mov     word [A0], T0_16        ; stub left the extracted word in T0.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time check: 256 stubs * 6 bytes.
ENDPROC iemAImpl_pextrw_u128
5869
5870;;
5871; vpextrw instruction.
5872;
5873; @param A0 Pointer to the 16bit output operand (output).
5874; @param A1 Pointer to the source media register size operand (input).
5875; @param A2 The 8-bit immediate
5876;
5877BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
5878 PROLOGUE_3_ARGS
5879 IEMIMPL_SSE_PROLOGUE
5880
5881 movzx A2, A2_8 ; must clear top bits
5882 movdqu xmm0, [A1]
5883 lea T1, [.imm0 xWrtRIP]
5884 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5885 lea T0, [A2 + A2*4] ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5886 %else
5887 lea T0, [A2 + A2*2] ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5888 %endif
5889 lea T1, [T1 + T0*2]
5890 IBT_NOTRACK
5891 call T1
5892 mov word [A0], T0_16
5893
5894 IEMIMPL_SSE_EPILOGUE
5895 EPILOGUE_3_ARGS
5896 %assign bImm 0
5897 %rep 256
5898.imm %+ bImm:
5899 IBT_ENDBRxx_WITHOUT_NOTRACK
5900 vpextrw T0_32, xmm0, bImm
5901 ret
5902 %assign bImm bImm + 1
5903 %endrep
5904.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5905ENDPROC iemAImpl_vpextrw_u128
5906
5907
5908;;
5909; movmskp{s,d} SSE instruction template
5910;
5911; @param 1 The SSE instruction name.
5912; @param 2 The AVX instruction name.
5913;
5914; @param A0 Pointer to the output register (output/byte sized).
5915; @param A1 Pointer to the source media register size operand (input).
5916;
5917%macro IEMIMPL_MEDIA_MOVMSK_P 2
5918BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5919 PROLOGUE_2_ARGS
5920 IEMIMPL_SSE_PROLOGUE
5921
5922 movdqu xmm0, [A1]
5923 %1 T0, xmm0
5924 mov byte [A0], T0_8
5925
5926 IEMIMPL_SSE_EPILOGUE
5927 EPILOGUE_2_ARGS
5928ENDPROC iemAImpl_ %+ %1 %+ _u128
5929
5930BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
5931 PROLOGUE_2_ARGS
5932 IEMIMPL_AVX_PROLOGUE
5933
5934 movdqu xmm0, [A1]
5935 %2 T0, xmm0
5936 mov byte [A0], T0_8
5937
5938 IEMIMPL_AVX_EPILOGUE
5939 EPILOGUE_2_ARGS
5940ENDPROC iemAImpl_ %+ %2 %+ _u128
5941
5942BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
5943 PROLOGUE_2_ARGS
5944 IEMIMPL_AVX_PROLOGUE
5945
5946 vmovdqu ymm0, [A1]
5947 %2 T0, ymm0
5948 mov byte [A0], T0_8
5949
5950 IEMIMPL_AVX_EPILOGUE
5951 EPILOGUE_2_ARGS
5952ENDPROC iemAImpl_ %+ %2 %+ _u256
5953%endmacro
5954
5955IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
5956IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5957
5958
5959;;
5960; Restores the SSE MXCSR register with the original value.
5961;
5962; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
5963; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
5964; @param 2 Expression giving the address of the FXSTATE of the guest.
5965;
5966; @note Restores the stack pointer.
5967;
5968%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
5969 sub xSP, 4
5970 stmxcsr [xSP]
5971 mov T0_32, [xSP]
5972 add xSP, 4
5973 ; Merge the status bits into the original MXCSR value.
5974 mov T1_32, [%2 + X86FXSTATE.MXCSR]
5975 and T0_32, X86_MXCSR_XCPT_FLAGS
5976 or T0_32, T1_32
5977 mov [%1], T0_32
5978
5979 ldmxcsr [xSP]
5980 add xSP, 4
5981%endmacro
5982
5983
5984;;
5985; cvttsd2si instruction - 32-bit variant.
5986;
5987; @param A0 FPU context (FXSTATE or XSAVEAREA).
5988; @param A1 Where to return the MXCSR value.
5989; @param A2 Pointer to the result operand (output).
5990; @param A3 Pointer to the second operand (input).
5991;
5992BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
5993 PROLOGUE_4_ARGS
5994 IEMIMPL_SSE_PROLOGUE
5995 SSE_LD_FXSTATE_MXCSR A0
5996
5997 cvttsd2si T0_32, [A3]
5998 mov dword [A2], T0_32
5999
6000 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6001 IEMIMPL_SSE_EPILOGUE
6002 EPILOGUE_4_ARGS
6003ENDPROC iemAImpl_cvttsd2si_i32_r64
6004
6005;;
6006; cvttsd2si instruction - 64-bit variant.
6007;
6008; @param A0 FPU context (FXSTATE or XSAVEAREA).
6009; @param A1 Where to return the MXCSR value.
6010; @param A2 Pointer to the result operand (output).
6011; @param A3 Pointer to the second operand (input).
6012;
6013BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
6014 PROLOGUE_4_ARGS
6015 IEMIMPL_SSE_PROLOGUE
6016 SSE_LD_FXSTATE_MXCSR A0
6017
6018 cvttsd2si T0, [A3]
6019 mov qword [A2], T0
6020
6021 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6022 IEMIMPL_SSE_EPILOGUE
6023 EPILOGUE_4_ARGS
6024ENDPROC iemAImpl_cvttsd2si_i64_r64
6025
6026
6027;;
6028; cvtsd2si instruction - 32-bit variant.
6029;
6030; @param A0 FPU context (FXSTATE or XSAVEAREA).
6031; @param A1 Where to return the MXCSR value.
6032; @param A2 Pointer to the result operand (output).
6033; @param A3 Pointer to the second operand (input).
6034;
6035BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
6036 PROLOGUE_4_ARGS
6037 IEMIMPL_SSE_PROLOGUE
6038 SSE_LD_FXSTATE_MXCSR A0
6039
6040 cvtsd2si T0_32, [A3]
6041 mov dword [A2], T0_32
6042
6043 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6044 IEMIMPL_SSE_EPILOGUE
6045 EPILOGUE_4_ARGS
6046ENDPROC iemAImpl_cvtsd2si_i32_r64
6047
6048;;
6049; cvtsd2si instruction - 64-bit variant.
6050;
6051; @param A0 FPU context (FXSTATE or XSAVEAREA).
6052; @param A1 Where to return the MXCSR value.
6053; @param A2 Pointer to the result operand (output).
6054; @param A3 Pointer to the second operand (input).
6055;
6056BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
6057 PROLOGUE_4_ARGS
6058 IEMIMPL_SSE_PROLOGUE
6059 SSE_LD_FXSTATE_MXCSR A0
6060
6061 cvtsd2si T0, [A3]
6062 mov qword [A2], T0
6063
6064 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6065 IEMIMPL_SSE_EPILOGUE
6066 EPILOGUE_4_ARGS
6067ENDPROC iemAImpl_cvtsd2si_i64_r64
6068
6069
6070;;
6071; cvttss2si instruction - 32-bit variant.
6072;
6073; @param A0 FPU context (FXSTATE or XSAVEAREA).
6074; @param A1 Where to return the MXCSR value.
6075; @param A2 Pointer to the result operand (output).
6076; @param A3 Pointer to the second operand (input).
6077;
6078BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
6079 PROLOGUE_4_ARGS
6080 IEMIMPL_SSE_PROLOGUE
6081 SSE_LD_FXSTATE_MXCSR A0
6082
6083 cvttss2si T0_32, [A3]
6084 mov dword [A2], T0_32
6085
6086 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6087 IEMIMPL_SSE_EPILOGUE
6088 EPILOGUE_4_ARGS
6089ENDPROC iemAImpl_cvttss2si_i32_r32
6090
6091;;
6092; cvttss2si instruction - 64-bit variant.
6093;
6094; @param A0 FPU context (FXSTATE or XSAVEAREA).
6095; @param A1 Where to return the MXCSR value.
6096; @param A2 Pointer to the result operand (output).
6097; @param A3 Pointer to the second operand (input).
6098;
6099BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
6100 PROLOGUE_4_ARGS
6101 IEMIMPL_SSE_PROLOGUE
6102 SSE_LD_FXSTATE_MXCSR A0
6103
6104 cvttss2si T0, [A3]
6105 mov qword [A2], T0
6106
6107 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6108 IEMIMPL_SSE_EPILOGUE
6109 EPILOGUE_4_ARGS
6110ENDPROC iemAImpl_cvttss2si_i64_r32
6111
6112
6113;;
6114; cvtss2si instruction - 32-bit variant.
6115;
6116; @param A0 FPU context (FXSTATE or XSAVEAREA).
6117; @param A1 Where to return the MXCSR value.
6118; @param A2 Pointer to the result operand (output).
6119; @param A3 Pointer to the second operand (input).
6120;
6121BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
6122 PROLOGUE_4_ARGS
6123 IEMIMPL_SSE_PROLOGUE
6124 SSE_LD_FXSTATE_MXCSR A0
6125
6126 cvtss2si T0_32, [A3]
6127 mov dword [A2], T0_32
6128
6129 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6130 IEMIMPL_SSE_EPILOGUE
6131 EPILOGUE_4_ARGS
6132ENDPROC iemAImpl_cvtss2si_i32_r32
6133
6134;;
6135; cvtss2si instruction - 64-bit variant.
6136;
6137; @param A0 FPU context (FXSTATE or XSAVEAREA).
6138; @param A1 Where to return the MXCSR value.
6139; @param A2 Pointer to the result operand (output).
6140; @param A3 Pointer to the second operand (input).
6141;
6142BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
6143 PROLOGUE_4_ARGS
6144 IEMIMPL_SSE_PROLOGUE
6145 SSE_LD_FXSTATE_MXCSR A0
6146
6147 cvtss2si T0, [A3]
6148 mov qword [A2], T0
6149
6150 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6151 IEMIMPL_SSE_EPILOGUE
6152 EPILOGUE_4_ARGS
6153ENDPROC iemAImpl_cvtss2si_i64_r32
6154
6155
6156;;
6157; cvtsi2ss instruction - 32-bit variant.
6158;
6159; @param A0 FPU context (FXSTATE or XSAVEAREA).
6160; @param A1 Where to return the MXCSR value.
6161; @param A2 Pointer to the result operand (output).
6162; @param A3 Pointer to the second operand (input).
6163;
6164BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
6165 PROLOGUE_4_ARGS
6166 IEMIMPL_SSE_PROLOGUE
6167 SSE_LD_FXSTATE_MXCSR A0
6168
6169 cvtsi2ss xmm0, dword [A3]
6170 movd dword [A2], xmm0
6171
6172 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6173 IEMIMPL_SSE_EPILOGUE
6174 EPILOGUE_4_ARGS
6175ENDPROC iemAImpl_cvtsi2ss_r32_i32
6176
6177;;
6178; cvtsi2ss instruction - 64-bit variant.
6179;
6180; @param A0 FPU context (FXSTATE or XSAVEAREA).
6181; @param A1 Where to return the MXCSR value.
6182; @param A2 Pointer to the result operand (output).
6183; @param A3 Pointer to the second operand (input).
6184;
6185BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
6186 PROLOGUE_4_ARGS
6187 IEMIMPL_SSE_PROLOGUE
6188 SSE_LD_FXSTATE_MXCSR A0
6189
6190 cvtsi2ss xmm0, qword [A3]
6191 movd dword [A2], xmm0
6192
6193 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6194 IEMIMPL_SSE_EPILOGUE
6195 EPILOGUE_4_ARGS
6196ENDPROC iemAImpl_cvtsi2ss_r32_i64
6197
6198
6199;;
6200; cvtsi2sd instruction - 32-bit variant.
6201;
6202; @param A0 FPU context (FXSTATE or XSAVEAREA).
6203; @param A1 Where to return the MXCSR value.
6204; @param A2 Pointer to the result operand (output).
6205; @param A3 Pointer to the second operand (input).
6206;
6207BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
6208 PROLOGUE_4_ARGS
6209 IEMIMPL_SSE_PROLOGUE
6210 SSE_LD_FXSTATE_MXCSR A0
6211
6212 cvtsi2sd xmm0, dword [A3]
6213 movq [A2], xmm0
6214
6215 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6216 IEMIMPL_SSE_EPILOGUE
6217 EPILOGUE_4_ARGS
6218ENDPROC iemAImpl_cvtsi2sd_r64_i32
6219
6220;;
6221; cvtsi2sd instruction - 64-bit variant.
6222;
6223; @param A0 FPU context (FXSTATE or XSAVEAREA).
6224; @param A1 Where to return the MXCSR value.
6225; @param A2 Pointer to the result operand (output).
6226; @param A3 Pointer to the second operand (input).
6227;
6228BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
6229 PROLOGUE_4_ARGS
6230 IEMIMPL_SSE_PROLOGUE
6231 SSE_LD_FXSTATE_MXCSR A0
6232
6233 cvtsi2sd xmm0, qword [A3]
6234 movq [A2], xmm0
6235
6236 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6237 IEMIMPL_SSE_EPILOGUE
6238 EPILOGUE_4_ARGS
6239ENDPROC iemAImpl_cvtsi2sd_r64_i64
6240
6241
6242;;
6243; Initialize the SSE MXCSR register using the guest value partially to
6244; account for rounding mode.
6245;
6246; @uses 4 bytes of stack to save the original value, T0.
6247; @param 1 Expression giving the address of the MXCSR register of the guest.
6248;
6249%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
6250 sub xSP, 4
6251
6252 stmxcsr [xSP]
6253 mov T0_32, [%1]
6254 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
6255 or T0_32, X86_MXCSR_XCPT_MASK
6256 sub xSP, 4
6257 mov [xSP], T0_32
6258 ldmxcsr [xSP]
6259 add xSP, 4
6260%endmacro
6261
6262
6263;;
6264; Restores the SSE MXCSR register with the original value.
6265;
6266; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
6267; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6268;
6269; @note Restores the stack pointer.
6270;
6271%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
6272 sub xSP, 4
6273 stmxcsr [xSP]
6274 mov T0_32, [xSP]
6275 add xSP, 4
6276 ; Merge the status bits into the original MXCSR value.
6277 mov T1_32, [%1]
6278 and T0_32, X86_MXCSR_XCPT_FLAGS
6279 or T0_32, T1_32
6280 mov [%1], T0_32
6281
6282 ldmxcsr [xSP]
6283 add xSP, 4
6284%endmacro
6285
6286
6287;
6288; UCOMISS (SSE)
6289;
6290; @param A0 Pointer to the MXCSR value (input/output).
6291; @param A1 Pointer to the EFLAGS value (input/output).
6292; @param A2 Pointer to the first source operand (aka readonly destination).
6293; @param A3 Pointer to the second source operand.
6294;
6295BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
6296 PROLOGUE_4_ARGS
6297 IEMIMPL_SSE_PROLOGUE
6298 SSE_LD_FXSTATE_MXCSR_ONLY A0
6299
6300 movdqu xmm0, [A2]
6301 movdqu xmm1, [A3]
6302 ucomiss xmm0, xmm1
6303 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6304
6305 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6306 IEMIMPL_SSE_EPILOGUE
6307 EPILOGUE_4_ARGS
6308ENDPROC iemAImpl_ucomiss_u128
6309
6310BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
6311 PROLOGUE_4_ARGS
6312 IEMIMPL_SSE_PROLOGUE
6313 SSE_LD_FXSTATE_MXCSR_ONLY A0
6314
6315 movdqu xmm0, [A2]
6316 movdqu xmm1, [A3]
6317 vucomiss xmm0, xmm1
6318 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6319
6320 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6321 IEMIMPL_SSE_EPILOGUE
6322 EPILOGUE_4_ARGS
6323ENDPROC iemAImpl_vucomiss_u128
6324
6325
6326;
6327; UCOMISD (SSE)
6328;
6329; @param A0 Pointer to the MXCSR value (input/output).
6330; @param A1 Pointer to the EFLAGS value (input/output).
6331; @param A2 Pointer to the first source operand (aka readonly destination).
6332; @param A3 Pointer to the second source operand.
6333;
6334BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
6335 PROLOGUE_4_ARGS
6336 IEMIMPL_SSE_PROLOGUE
6337 SSE_LD_FXSTATE_MXCSR_ONLY A0
6338
6339 movdqu xmm0, [A2]
6340 movdqu xmm1, [A3]
6341 ucomisd xmm0, xmm1
6342 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6343
6344 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6345 IEMIMPL_SSE_EPILOGUE
6346 EPILOGUE_4_ARGS
6347ENDPROC iemAImpl_ucomisd_u128
6348
6349BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
6350 PROLOGUE_4_ARGS
6351 IEMIMPL_SSE_PROLOGUE
6352 SSE_LD_FXSTATE_MXCSR_ONLY A0
6353
6354 movdqu xmm0, [A2]
6355 movdqu xmm1, [A3]
6356 vucomisd xmm0, xmm1
6357 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6358
6359 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6360 IEMIMPL_SSE_EPILOGUE
6361 EPILOGUE_4_ARGS
6362ENDPROC iemAImpl_vucomisd_u128
6363
6364;
6365; COMISS (SSE)
6366;
6367; @param A0 Pointer to the MXCSR value (input/output).
6368; @param A1 Pointer to the EFLAGS value (input/output).
6369; @param A2 Pointer to the first source operand (aka readonly destination).
6370; @param A3 Pointer to the second source operand.
6371;
6372BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
6373 PROLOGUE_4_ARGS
6374 IEMIMPL_SSE_PROLOGUE
6375 SSE_LD_FXSTATE_MXCSR_ONLY A0
6376
6377 movdqu xmm0, [A2]
6378 movdqu xmm1, [A3]
6379 comiss xmm0, xmm1
6380 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6381
6382 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6383 IEMIMPL_SSE_EPILOGUE
6384 EPILOGUE_4_ARGS
6385ENDPROC iemAImpl_comiss_u128
6386
6387BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
6388 PROLOGUE_4_ARGS
6389 IEMIMPL_SSE_PROLOGUE
6390 SSE_LD_FXSTATE_MXCSR_ONLY A0
6391
6392 movdqu xmm0, [A2]
6393 movdqu xmm1, [A3]
6394 vcomiss xmm0, xmm1
6395 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6396
6397 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6398 IEMIMPL_SSE_EPILOGUE
6399 EPILOGUE_4_ARGS
6400ENDPROC iemAImpl_vcomiss_u128
6401
6402
6403;
6404; COMISD (SSE)
6405;
6406; @param A0 Pointer to the MXCSR value (input/output).
6407; @param A1 Pointer to the EFLAGS value (input/output).
6408; @param A2 Pointer to the first source operand (aka readonly destination).
6409; @param A3 Pointer to the second source operand.
6410;
6411BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
6412 PROLOGUE_4_ARGS
6413 IEMIMPL_SSE_PROLOGUE
6414 SSE_LD_FXSTATE_MXCSR_ONLY A0
6415
6416 movdqu xmm0, [A2]
6417 movdqu xmm1, [A3]
6418 comisd xmm0, xmm1
6419 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6420
6421 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6422 IEMIMPL_SSE_EPILOGUE
6423 EPILOGUE_4_ARGS
6424ENDPROC iemAImpl_comisd_u128
6425
6426BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
6427 PROLOGUE_4_ARGS
6428 IEMIMPL_SSE_PROLOGUE
6429 SSE_LD_FXSTATE_MXCSR_ONLY A0
6430
6431 movdqu xmm0, [A2]
6432 movdqu xmm1, [A3]
6433 vcomisd xmm0, xmm1
6434 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6435
6436 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6437 IEMIMPL_SSE_EPILOGUE
6438 EPILOGUE_4_ARGS
6439ENDPROC iemAImpl_vcomisd_u128
6440
6441
6442;;
6443; Need to move this as well somewhere better?
6444;
6445struc IEMMEDIAF2XMMSRC
6446 .uSrc1 resd 4
6447 .uSrc2 resd 4
6448endstruc
6449
6450
6451;
6452; CMPPS (SSE)
6453;
6454; @param A0 Pointer to the MXCSR value (input/output).
6455; @param A1 Pointer to the first media register size operand (output).
6456; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6457; @param A3 The 8-bit immediate (input).
6458;
6459BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
6460 PROLOGUE_4_ARGS
6461 IEMIMPL_SSE_PROLOGUE
6462 SSE_LD_FXSTATE_MXCSR_ONLY A0
6463
6464 movzx A3, A3_8 ; must clear top bits
6465 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6466 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6467 lea T1, [.imm0 xWrtRIP]
6468 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6469 lea T0, [A3 + A3*8] ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
6470 %else
6471 lea T0, [A3 + A3*4] ; sizeof(cmpps+ret) == 5: A3 * 5
6472 %endif
6473 lea T1, [T1 + T0]
6474 IBT_NOTRACK
6475 call T1
6476 movdqu [A1], xmm0
6477
6478 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6479 IEMIMPL_SSE_EPILOGUE
6480 EPILOGUE_4_ARGS
6481 %assign bImm 0
6482 %rep 256
6483.imm %+ bImm:
6484 IBT_ENDBRxx_WITHOUT_NOTRACK
6485 cmpps xmm0, xmm1, bImm
6486 ret
6487 %assign bImm bImm + 1
6488 %endrep
6489.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
6490ENDPROC iemAImpl_cmpps_u128
6491
6492;;
6493; SSE instructions with 8-bit immediates of the form
6494; xxx xmm1, xmm2, imm8.
6495; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
6496; register.
6497;
6498; @param 1 The instruction name.
6499;
6500; @param A0 Pointer to the MXCSR value (input/output).
6501; @param A1 Pointer to the first media register size operand (output).
6502; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6503; @param A3 The 8-bit immediate (input).
6504;
6505%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
6506BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6507 PROLOGUE_4_ARGS
6508 IEMIMPL_SSE_PROLOGUE
6509 SSE_LD_FXSTATE_MXCSR_ONLY A0
6510
6511 movzx A3, A3_8 ; must clear top bits
6512 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6513 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6514 lea T1, [.imm0 xWrtRIP]
6515 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6516 lea T0, [A3 + A3*4] ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
6517 %else
6518 lea T0, [A3 + A3*2] ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
6519 %endif
6520 lea T1, [T1 + T0*2]
6521 IBT_NOTRACK
6522 call T1
6523 movdqu [A1], xmm0
6524
6525 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6526 IEMIMPL_SSE_EPILOGUE
6527 EPILOGUE_4_ARGS
6528 %assign bImm 0
6529 %rep 256
6530.imm %+ bImm:
6531 IBT_ENDBRxx_WITHOUT_NOTRACK
6532 %1 xmm0, xmm1, bImm
6533 ret
6534 %assign bImm bImm + 1
6535 %endrep
6536.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6537ENDPROC iemAImpl_ %+ %1 %+ _u128
6538%endmacro
6539
6540IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
6541IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
6542IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6543
6544;;
6545; SSE instructions with 8-bit immediates of the form
6546; xxx xmm1, xmm2, imm8.
6547; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6548; register.
6549;
6550; @param 1 The instruction name.
6551;
6552; @param A0 Pointer to the MXCSR value (input/output).
6553; @param A1 Pointer to the first media register size operand (output).
6554; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6555; @param A3 The 8-bit immediate (input).
6556;
6557%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
6558BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6559 PROLOGUE_4_ARGS
6560 IEMIMPL_SSE_PROLOGUE
6561 SSE_LD_FXSTATE_MXCSR_ONLY A0
6562
6563 movzx A3, A3_8 ; must clear top bits
6564 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6565 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6566 lea T1, [.imm0 xWrtRIP]
6567 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6568 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
6569 lea T1, [T1 + T0*4]
6570 %else
6571 lea T1, [T1 + A3*8] ; sizeof(insn+ret+int3) == 8: A3 * 8
6572 %endif
6573 IBT_NOTRACK
6574 call T1
6575 movdqu [A1], xmm0
6576
6577 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6578 IEMIMPL_SSE_EPILOGUE
6579 EPILOGUE_4_ARGS
6580 %assign bImm 0
6581 %rep 256
6582.imm %+ bImm:
6583 IBT_ENDBRxx_WITHOUT_NOTRACK
6584 %1 xmm0, xmm1, bImm
6585 ret
6586 int3
6587 %assign bImm bImm + 1
6588 %endrep
6589.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
6590ENDPROC iemAImpl_ %+ %1 %+ _u128
6591%endmacro
6592
6593IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
6594IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
6595IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
6596IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6597IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6598IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6599
6600
6601;;
6602; SSE instructions of the form
6603; xxx mm, xmm.
6604; and we need to load and save the MXCSR register.
6605;
6606; @param 1 The instruction name.
6607;
6608; @param A0 Pointer to the MXCSR value (input/output).
6609; @param A1 Pointer to the first MMX register sized operand (output).
6610; @param A2 Pointer to the media register sized operand (input).
6611;
6612%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
6613BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6614 PROLOGUE_3_ARGS
6615 IEMIMPL_SSE_PROLOGUE
6616 SSE_LD_FXSTATE_MXCSR_ONLY A0
6617
6618 movdqu xmm0, [A2]
6619 %1 mm0, xmm0
6620 movq [A1], mm0
6621
6622 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6623 IEMIMPL_SSE_EPILOGUE
6624 EPILOGUE_3_ARGS
6625ENDPROC iemAImpl_ %+ %1 %+ _u128
6626%endmacro
6627
6628IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
6629IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6630
6631;;
6632; SSE instructions of the form
6633; xxx xmm, xmm/m64.
6634; and we need to load and save the MXCSR register.
6635;
6636; @param 1 The instruction name.
6637;
6638; @param A0 Pointer to the MXCSR value (input/output).
6639; @param A1 Pointer to the first media register sized operand (input/output).
6640; @param A2 The 64bit source value from a MMX media register (input)
6641;
6642%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
6643BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6644 PROLOGUE_3_ARGS
6645 IEMIMPL_SSE_PROLOGUE
6646 SSE_LD_FXSTATE_MXCSR_ONLY A0
6647
6648 movdqu xmm0, [A1]
6649 movq mm0, A2
6650 %1 xmm0, mm0
6651 movdqu [A1], xmm0
6652
6653 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6654 IEMIMPL_SSE_EPILOGUE
6655 EPILOGUE_3_ARGS
6656ENDPROC iemAImpl_ %+ %1 %+ _u128
6657%endmacro
6658
6659IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
6660IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6661
6662;;
6663; SSE instructions of the form
6664; xxx mm, xmm/m64.
6665; and we need to load and save the MXCSR register.
6666;
6667; @param 1 The instruction name.
6668;
6669; @param A0 Pointer to the MXCSR value (input/output).
6670; @param A1 Pointer to the first MMX media register sized operand (output).
6671; @param A2 The 64bit source value (input).
6672;
6673%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
6674BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6675 PROLOGUE_3_ARGS
6676 IEMIMPL_SSE_PROLOGUE
6677 SSE_LD_FXSTATE_MXCSR_ONLY A0
6678
6679 movq xmm0, A2
6680 %1 mm0, xmm0
6681 movq [A1], mm0
6682
6683 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6684 IEMIMPL_SSE_EPILOGUE
6685 EPILOGUE_3_ARGS
6686ENDPROC iemAImpl_ %+ %1 %+ _u128
6687%endmacro
6688
6689IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
6690IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6691
6692;
6693; All forms of RDRAND and RDSEED
6694;
6695; @param A0 Pointer to the destination operand.
6696; @param A1 Pointer to the EFLAGS value (input/output).
6697;
6698%macro IEMIMPL_RDRAND_RDSEED 3
6699BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
6700 PROLOGUE_2_ARGS
6701
6702 %1 %2
6703 mov [A0], %2
6704 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6705
6706 EPILOGUE_2_ARGS
6707ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
6708%endmacro
6709
6710IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
6711IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
6712IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
6713IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
6714IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
6715IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6716
6717
6718;;
6719; sha1rnds4 xmm1, xmm2, imm8.
6720;
6721; @param 1 The instruction name.
6722;
6723; @param A0 Pointer to the first media register size operand (input/output).
6724; @param A1 Pointer to the second source media register size operand (input).
6725; @param A2 The 8-bit immediate
6726;
6727BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
6728 PROLOGUE_3_ARGS
6729 IEMIMPL_SSE_PROLOGUE
6730
6731 movzx A2, A2_8 ; must clear top bits
6732 movdqu xmm0, [A0]
6733 movdqu xmm1, [A1]
6734 lea T1, [.imm0 xWrtRIP]
6735 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6736 lea T0, [A2 + A2*4] ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
6737 %else
6738 lea T0, [A2 + A2*2] ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
6739 %endif
6740 lea T1, [T1 + T0*2]
6741 IBT_NOTRACK
6742 call T1
6743 movdqu [A0], xmm0
6744
6745 IEMIMPL_SSE_EPILOGUE
6746 EPILOGUE_3_ARGS
6747 %assign bImm 0
6748 %rep 256
6749.imm %+ bImm:
6750 IBT_ENDBRxx_WITHOUT_NOTRACK
6751 sha1rnds4 xmm0, xmm1, bImm
6752 ret
6753 %assign bImm bImm + 1
6754 %endrep
6755.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6756ENDPROC iemAImpl_sha1rnds4_u128
6757
6758
6759;;
6760; sha256rnds2 xmm1, xmm2, <XMM0>.
6761;
6762; @param 1 The instruction name.
6763;
6764; @param A0 Pointer to the first media register size operand (input/output).
6765; @param A1 Pointer to the second source media register size operand (input).
6766; @param A2 Pointer to the implicit XMM0 constants (input).
6767;
6768BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
6769 PROLOGUE_3_ARGS
6770 IEMIMPL_SSE_PROLOGUE
6771
6772 movdqu xmm0, [A2]
6773 movdqu xmm1, [A0]
6774 movdqu xmm2, [A1]
6775 sha256rnds2 xmm1, xmm2
6776 movdqu [A0], xmm1
6777
6778 IEMIMPL_SSE_EPILOGUE
6779 EPILOGUE_3_ARGS
6780ENDPROC iemAImpl_sha256rnds2_u128
6781
6782
6783;
6784; 32-bit forms of ADCX and ADOX
6785;
6786; @param A0 Pointer to the destination operand (input/output).
6787; @param A1 32-bit source operand 1 (input).
6788; @param A2 Pointer to the EFLAGS value (input/output).
6789;
6790%macro IEMIMPL_ADX_32 2
6791BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
6792 PROLOGUE_4_ARGS
6793
6794 IEM_LOAD_FLAGS A2, %2, 0
6795 %1 A1_32, [A0]
6796 mov [A0], A1_32
6797 IEM_SAVE_FLAGS A2, %2, 0
6798
6799 EPILOGUE_4_ARGS
6800ENDPROC iemAImpl_ %+ %1 %+ _u32
6801%endmacro
6802
6803;
6804; 64-bit forms of ADCX and ADOX
6805;
6806; @param A0 Pointer to the destination operand (input/output).
6807; @param A1 64-bit source operand 1 (input).
6808; @param A2 Pointer to the EFLAGS value (input/output).
6809;
6810%macro IEMIMPL_ADX_64 2
6811BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
6812 PROLOGUE_4_ARGS
6813
6814 IEM_LOAD_FLAGS A2, %2, 0
6815 %1 A1, [A0]
6816 mov [A0], A1
6817 IEM_SAVE_FLAGS A2, %2, 0
6818
6819 EPILOGUE_4_ARGS
6820ENDPROC iemAImpl_ %+ %1 %+ _u64
6821%endmacro
6822
6823IEMIMPL_ADX_32 adcx, X86_EFL_CF
6824IEMIMPL_ADX_64 adcx, X86_EFL_CF
6825
6826IEMIMPL_ADX_32 adox, X86_EFL_OF
6827IEMIMPL_ADX_64 adox, X86_EFL_OF
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette