VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 103649

Last change on this file since 103649 was 103558, checked in by vboxsync, 9 months ago

VMM/IEM: Implement vpermilpd instruction emulations, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 198.4 KB
Line 
1; $Id: IEMAllAImpl.asm 103558 2024-02-24 11:06:53Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; RET XX / RET wrapper for fastcall.
;
; Only 32-bit Windows fastcall requires the callee to pop its own stack
; arguments; every other target uses a plain near return.
;
; @param 1    Number of argument bytes to pop (used on x86/Windows only).
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret                             ; 64-bit hosts: caller cleans up.
 %elifndef RT_OS_WINDOWS
        ret                             ; 32-bit non-Windows: caller cleans up.
 %else
        ret %1                          ; 32-bit Windows fastcall: pop the arguments.
 %endif
%endmacro
55
;;
; NAME for fastcall functions.
;
; On 32-bit Windows the fastcall convention decorates the symbol as
; <prefix>Name@cbArgs; on every other target the plain NAME() mangling is used
; and the extra arguments are ignored.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
70
;;
; BEGINPROC for fastcall functions.
;
; Emits the (possibly decorated) global symbol followed by an IBT_ENDBRxx
; landing pad so the function can be reached indirectly when CET/IBT is
; enabled.
;
; @param 1 The function name (C).
; @param 2 The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
        IBT_ENDBRxx
%endmacro
81
82
;
; We employ some macro assembly here to hide the calling convention differences.
;
; PROLOGUE_n_ARGS / EPILOGUE_n_ARGS set up and tear down the A0..A3 argument
; registers for n arguments.  The _EX epilogue variants take the number of
; argument bytes to pop off the stack on 32-bit hosts.
;
%ifdef RT_ARCH_AMD64
 ;
 ; AMD64: all four arguments arrive in registers, so the prologues are empty
 ; and the epilogues are plain returns (the _EX byte count is irrelevant here).
 ;
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64: integer arguments arrive in rdi, rsi, rdx, rcx.
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
  %define A3_8      cl
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64: integer arguments arrive in rcx, rdx, r8, r9.
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
  %define A3_8      r9b
 %endif

 ; Temporary/scratch registers - volatile in both 64-bit conventions.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86: A0/A1 arrive in ecx/edx (fastcall); A2/A3 are fetched off the stack
 ; into the callee-saved ebx/esi, which the prologues preserve along with
 ; edi (used as T1).
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; A2: 3rd argument, above ret addr + saved ebx.
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; A2: 3rd argument, above ret addr + 3 saved regs.
        mov     esi, [esp + 12 + 4 + 4] ; A3: 4th argument.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si
 ; no A3_8: esi has no 8-bit sub-register.

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
 ; no T1_8 (edi) and no T2 on x86.
%endif
273
274
;;
; Load the relevant flags from [%1] if there are undefined flags (%3) or if
; loading is forced (%4).
;
; Merges the selected guest flag bits into the host EFLAGS so the emulated
; instruction below executes with the guest's flag state.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
; @param 4 Force loading the flags; optional, defaults to 1.
;
%macro IEM_MAYBE_LOAD_FLAGS 3-4 1
 %if (%3 + %4) != 0
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 %endif
%endmacro
295
;;
; Load the relevant flags from [%1].
;
; Unconditional variant of IEM_MAYBE_LOAD_FLAGS: always merges the selected
; guest flag bits into the host EFLAGS.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of flags to load.
; @param 3 The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
%endmacro
313
;;
; Update the flag.
;
; Copies the masked host EFLAGS bits (%2|%3) produced by the emulated
; instruction into the guest eflags at [%1], preserving all other guest bits.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                      ; T1 = host EFLAGS after the instruction.
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
333
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; Like IEM_SAVE_FLAGS, but additionally force-clears (%3) and force-sets (%4)
; extra flag bits in the stored guest eflags.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host EFLAGS.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
357
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks Clobbers T1, stack, %6, EFLAGS; also T2 on AMD64.  T0 is
;          saved/restored on x86 and untouched on AMD64.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                      ; T2 = host EFLAGS (T0/xAX holds the result).
 %else
        push    T0                      ; no T2 on x86: save T0 and borrow it instead.
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; restore the caller's T0/xAX.
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; g_afParity is indexed by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
411
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Pure guest-eflags rewrite: does not read the host EFLAGS at all.
;
; @remarks Clobbers T0.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
432
;;
; Calculates the new EFLAGS using fixed clear and set bit masks, and PF
; computed from a result byte via the g_afParity lookup table.
;
; @remarks Clobbers T0, %4, EFLAGS; also T2 on AMD64 (holds the table address).
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; g_afParity is indexed by the low byte only.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
457
458
;;
; Checks that the size expression %1 matches %2 adjusted according to
; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
;
; Works by emitting dw operands that overflow - and thus trigger assembler
; warnings - when the actual size deviates from the expected one.
;
; @param 1 The jump array size assembly expression.
; @param 2 The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
;
%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        dw      (0xffff - %2 - 256*4) + %1      ; will cause warning if entries are too big.
        dw      (0xffff + %2 + 256*4) - %1      ; will cause warning if entries are too small.
 %else
        dw      (0xffff - %2) + %1              ; will cause warning if entries are too big.
        dw      (0xffff + %2) - %1              ; will cause warning if entries are too small.
 %endif
%endmacro
474
475
476;*********************************************************************************************************************************
477;* External Symbols *
478;*********************************************************************************************************************************
479extern NAME(g_afParity)
480
481
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
; @param 5 Force flag loading (ADC, SBC).
;
%macro IEMIMPL_BIN_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8            ; _EX byte count only matters on x86, ignored here.
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
571
; instr,lock, modified-flags, undefined flags, force loading flags
; ADC and SBB consume the incoming CF, hence they force flag loading (last column).
IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 1
IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 1
IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
582
583
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3 ; omitted 4th arg defaults to 1: always load.
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
624
;;
; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; NOTE(review): these are 3-argument functions (declared with 12 bytes of
; arguments) yet use the 4-argument prologue/epilogue pair; on a 32-bit host
; PROLOGUE_4_ARGS would read a non-existent 4th stack argument into A3
; (unused) and EPILOGUE_4_ARGS pops 8 stack bytes - confirm intent before
; changing, 32-bit hosts may not build this path.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]             ; the destination is also the first source.
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
666
667
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source (shift count)
; operand in A2.  No eflags are read or written.
;
; @param 1 The instruction mnemonic.
; @param 2 Fallback instruction if applicable.
; @param 3 Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; shift count -> cl (rcx is A3, unused here).
        %2      A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; count (A2/r8) -> rcx/cl; A2 now holds the dst pointer.
        %2      A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; shift count -> cl (rcx is A3, unused here).
        %2      A1, cl
        mov     [A0], A1                ; store the full 64-bit result (was a truncating 32-bit store).
 %else
        xchg    A2, A0                  ; count (A2/r8) -> rcx/cl; A2 now holds the dst pointer.
        %2      A1, cl
        mov     [A2], A1                ; full 64-bit store; A0 holds the count now, never write thru it.
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
 %endif
 %endif ; RT_ARCH_AMD64
%endmacro
732
; instr, fallback instr, emit fallback
; PDEP/PEXT have no simple one-instruction fallback, so those are done in C.
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
739
740
;
; RORX uses an immediate byte for the shift count, so we only do the
; fallback implementation of that one (plain ROR with the count in cl).
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; rotate count -> cl (rcx is A3, unused here).
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; count (A2/r8) -> rcx/cl; A2 now holds the dst pointer.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1
 %else
        xchg    A2, A0
        ror     A1, cl
        mov     [A2], A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
774
775
;
; MULX - unsigned multiply with the implicit xDX source operand; stores the
; low half to [A1] and the high half to [A0].  mulx itself leaves EFLAGS
; untouched; the fallback uses MUL, whose flag changes are simply discarded.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32


BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
815
%ifdef RT_ARCH_AMD64
; 64-bit MULX variants - same layout as the 32-bit ones above:
; low half to the 2nd pointer, high half to the 1st.
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
855
856
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4 ; omitted 4th arg defaults to 1: always load.
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; modified efl, undefined eflags
IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
934
935;;
936; Macro for implementing a bit search operator.
937;
938; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
939; system where the 64-bit accesses requires hand coding.
940;
941; All the functions takes a pointer to the destination memory operand in A0,
942; the source register operand in A1 and a pointer to eflags in A2.
943;
944; In the ZF case the destination register is 'undefined', however it seems that
945; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
946; AMD and Intel and accoridng to https://www.sandpile.org/x86/flags.htm between
947; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
948; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
949;
950; @param 1 The instruction mnemonic.
951; @param 2 The modified flags.
952; @param 3 The undefined flags.
953; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
954;
955%macro IEMIMPL_BIT_OP2 4
956BEGINCODE
957BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
958 PROLOGUE_3_ARGS
959 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
960 %1 T0_16, A1_16
961%if %4 != 0
962 jz .unchanged_dst
963%endif
964 mov [A0], T0_16
965.unchanged_dst:
966 IEM_SAVE_FLAGS A2, %2, %3
967 EPILOGUE_3_ARGS
968ENDPROC iemAImpl_ %+ %1 %+ _u16
969
970BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
971 PROLOGUE_3_ARGS
972 %1 T1_16, A1_16
973%if %4 != 0
974 jz .unchanged_dst
975%endif
976 mov [A0], T1_16
977 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
978 EPILOGUE_3_ARGS
979.unchanged_dst:
980 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
981 EPILOGUE_3_ARGS
982ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
983
984BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
985 PROLOGUE_3_ARGS
986 %1 T0_16, A1_16
987%if %4 != 0
988 jz .unchanged_dst
989%endif
990 mov [A0], T0_16
991.unchanged_dst:
992 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
993 EPILOGUE_3_ARGS
994ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
995
996
997BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
998 PROLOGUE_3_ARGS
999 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1000 %1 T0_32, A1_32
1001%if %4 != 0
1002 jz .unchanged_dst
1003%endif
1004 mov [A0], T0_32
1005.unchanged_dst:
1006 IEM_SAVE_FLAGS A2, %2, %3
1007 EPILOGUE_3_ARGS
1008ENDPROC iemAImpl_ %+ %1 %+ _u32
1009
1010BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1011 PROLOGUE_3_ARGS
1012 %1 T1_32, A1_32
1013%if %4 != 0
1014 jz .unchanged_dst
1015%endif
1016 mov [A0], T1_32
1017 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1018 EPILOGUE_3_ARGS
1019.unchanged_dst:
1020 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1021 EPILOGUE_3_ARGS
1022ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1023
1024BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1025 PROLOGUE_3_ARGS
1026 %1 T0_32, A1_32
1027%if %4 != 0
1028 jz .unchanged_dst
1029%endif
1030 mov [A0], T0_32
1031.unchanged_dst:
1032 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1033 EPILOGUE_3_ARGS
1034ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1035
1036
1037 %ifdef RT_ARCH_AMD64
1038
1039BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1040 PROLOGUE_3_ARGS
1041 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1042 %1 T0, A1
1043%if %4 != 0
1044 jz .unchanged_dst
1045%endif
1046 mov [A0], T0
1047.unchanged_dst:
1048 IEM_SAVE_FLAGS A2, %2, %3
1049 EPILOGUE_3_ARGS_EX 8
1050ENDPROC iemAImpl_ %+ %1 %+ _u64
1051
1052BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1053 PROLOGUE_3_ARGS
1054 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1055 %1 T1, A1
1056%if %4 != 0
1057 jz .unchanged_dst
1058%endif
1059 mov [A0], T1
1060 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1061 EPILOGUE_3_ARGS
1062.unchanged_dst:
1063 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1064 EPILOGUE_3_ARGS
1065ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1066
1067BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1068 PROLOGUE_3_ARGS
1069 %1 T0, A1
1070%if %4 != 0
1071 jz .unchanged_dst
1072%endif
1073 mov [A0], T0
1074.unchanged_dst:
1075 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1076 EPILOGUE_3_ARGS_EX 8
1077ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1078
1079 %endif ; RT_ARCH_AMD64
1080%endmacro
1081
; Instantiate IEMIMPL_BIT_OP2 for the bit-scan/count instructions.
; Parameters: mnemonic, modified EFLAGS, undefined EFLAGS, and whether the
; destination is left unchanged when the source is zero (1 for bsf/bsr, where
; the store is skipped when ZF is set; 0 for tzcnt/lzcnt, which always store).
IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1086
1087
;;
; Macro for implementing POPCNT.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; systems where the 64-bit access requires hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0_16, A1_16                ; T0 = %1 of the source register.
        mov     [A0], T0_16                 ; *puDst = result.
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1137
1138
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
BEGINCODE
; @param 1 EFLAGS that are modified.
; @param 2 Undefined EFLAGS.
; @param 3 Function suffix.
; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored),
; 2 for AMD (set AF, clear PF, ZF and SF).
; NOTE(review): the variation descriptions above look copy-pasted from the
; division macro; in the code below it is %4 == 1 (intel) that takes the
; AF/ZF-adjusting, SF/PF-calculating path while %4 == 2 (AMD) uses the
; plain flag save - confirm against the C-side prototypes.
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_16, word [A0]            ; two-operand form: reg = reg * *puDst (truncated).
        mov     [A0], A1_16                 ; store the product back into the destination.
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1194
1195
;
; XCHG for memory operands. This implies locking. No flag changes.
;
; Each function takes two arguments, first the pointer to the memory,
; then the pointer to the register. They all return void.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]                  ; T0 = *pReg
        xchg    [A0], T0_8                  ; atomic: xchg with a memory operand asserts LOCK implicitly
        mov     [A1], T0_8                  ; *pReg = old *pMem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1236
; Unlocked variants for fDisregardLock mode.
; Same contract as the locked variants above, but the swap is done with two
; plain loads and two plain stores, so it is not atomic.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]                  ; T0 = *pReg
        mov     T1_8, [A0]                  ; T1 = *pMem
        mov     [A0], T0_8                  ; non-atomic swap of the two values
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        mov     T1, [A0]
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1276
1277
;
; XADD for memory operands.
;
; Each function takes three arguments, first the pointer to the
; memory/register, then the pointer to the register, and finally a pointer to
; eflags. They all return void.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]                  ; T0 = *pReg (the addend)
        xadd    [A0], T0_8                  ; *pMem += T0; T0 = old *pMem
        mov     [A1], T0_8                  ; *pReg = old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1327
; Locked variants of the XADD functions above (same argument layout, with a
; LOCK prefix on the xadd to make the read-modify-write atomic).
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]                  ; T0 = *pReg (the addend)
        lock xadd [A0], T0_8                ; atomic: *pMem += T0; T0 = old *pMem
        mov     [A1], T0_8                  ; *pReg = old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1369
1370
;
; CMPXCHG8B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
; uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                         ; callee-saved; cmpxchg8b needs ebx

        mov     r11, rdx                    ; pu64EaxEdx (is also T1)
        mov     r10, rcx                    ; pu64Dst

        mov     ebx, [r8]                   ; ecx:ebx = value to write (*pu64EbxEcx)
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]                  ; edx:eax = comparand (*pu64EaxEdx)
        mov     edx, [r11 + 4]

        cmpxchg8b [r10]

        mov     [r11], eax                  ; write back edx:eax (old dest value on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                    ; pEFlags
        mov     r11, rdx                    ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                    ; pu64Dst
        mov     esi, edx                    ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0]     ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4]     ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b
1460
; Locked variant of iemAImpl_cmpxchg8b; identical except for the LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                         ; callee-saved; cmpxchg8b needs ebx

        mov     r11, rdx                    ; pu64EaxEdx (is also T1)
        mov     r10, rcx                    ; pu64Dst

        mov     ebx, [r8]                   ; ecx:ebx = value to write (*pu64EbxEcx)
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]                  ; edx:eax = comparand (*pu64EaxEdx)
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax                  ; write back edx:eax (old dest value on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                    ; pEFlags
        mov     r11, rdx                    ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                    ; pu64Dst
        mov     esi, edx                    ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0]     ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4]     ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
1535
1536%ifdef RT_ARCH_AMD64
1537
;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
; uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                         ; callee-saved; cmpxchg16b needs rbx

        mov     r11, rdx                    ; pu64RaxRdx (is also T1)
        mov     r10, rcx                    ; pu64Dst

        mov     rbx, [r8]                   ; rcx:rbx = value to write (*pu128RbxRcx)
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]                  ; rdx:rax = comparand (*pu128RaxRdx)
        mov     rdx, [r11 + 8]

        cmpxchg16b [r10]

        mov     [r11], rax                  ; write back rdx:rax (old dest value on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                    ; pEFlags
        mov     r11, rdx                    ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1597
; Locked variant of iemAImpl_cmpxchg16b; identical except for the LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                         ; callee-saved; cmpxchg16b needs rbx

        mov     r11, rdx                    ; pu64RaxRdx (is also T1)
        mov     r10, rcx                    ; pu64Dst

        mov     rbx, [r8]                   ; rcx:rbx = value to write (*pu128RbxRcx)
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]                  ; rdx:rax = comparand (*pu128RaxRdx)
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax                  ; write back rdx:rax (old dest value on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                    ; pEFlags
        mov     r11, rdx                    ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked
1642
1643%endif ; RT_ARCH_AMD64
1644
1645
;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
BEGINCODE
; @param 1 LOCK prefix or empty.
; @param 2 Function name suffix ('_locked' or empty).
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     al, [A1]                    ; al = comparand (*puEax)
        %1      cmpxchg [A0], A2_8          ; if (*puDst == al) *puDst = uReg; else al = *puDst
        mov     [A1], al                    ; write the accumulator back
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1      cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1      cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1      cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                    ; pu64Dst
        mov     esi, edx                    ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0]     ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4]     ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; NOTE(review): jz (ZF set = values were equal) branching to the label
        ; named .cmpxchg8b_not_equal looks inverted - see the @todo below.
        jz      .cmpxchg8b_not_equal
;; @todo this isn't correct. Need to do a 64-bit compare, not just the lower 32-bit.
        cmp     eax, eax                    ; just set the other flags.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx              ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1744
;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit access requires hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
; NOTE(review): an earlier revision of this comment described three arguments
; (A1 source register, A2 eflags), but the code below uses PROLOGUE_2_ARGS
; and only A0/A1 - unary ops have no source operand.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      byte [A0]                   ; apply the unary op directly to memory
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; Note: inc/dec do not modify CF, hence it is absent from their flag masks.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1833
1834
;
; BSWAP. No flag changes.
;
; Each function takes one argument, pointer to the value to bswap
; (input/output). They all return void.
;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]                 ; just in case any of the upper bits are used.
        db 66h                              ; operand-size prefix: together with the next
        bswap   T0_32                       ; instruction this encodes the 16-bit BSWAP form.
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16

BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32

BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        PROLOGUE_1_ARGS
        mov     T0, [A0]                    ; low dword
        mov     T1, [A0 + 4]                ; high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0                ; byte-reversed halves swap places too
        mov     [A0], T1
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1876
1877
;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit access requires hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 Force load flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                    ; the shift count must be in cl
        %1      byte [A0], cl
 %else
        xchg    A1, A0                      ; MSC: A0 is (e/r)cx, so swap to get the count
        %1      byte [A1], cl               ; into cl and the destination pointer into A1.
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
 %else
        xchg    A1, A0
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

;; @todo some questions wrt flags when the shift count is high according to intel docs...
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
1966
1967
;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit access requires hand coding.
;
; The functions take the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                      ; GCC: A3 is rcx; swap so the count lands in cl...
        %1      [A0], A1_16, cl
        xchg    A3, A2                      ; ...and swap back so A3 is pEFlags again for the save.
 %else
        xchg    A0, A2                      ; MSC: A0 is rcx; count into cl, dest pointer into A2.
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2038
2039
;;
; Macro for implementing multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit access requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer. The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 so the caller can be used for div/idiv as well as
; for the mul/imul implementation.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 Name suffix.
; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]                    ; al = low byte of *pu16AX
        %1      A1_8                        ; ax = al * operand (widening 8-bit mul/imul)
        mov     [A0], ax                    ; store the full 16-bit product
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                    ; return 0 (shared convention with div/idiv, see above)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]                    ; ax = *pu16AX
 %ifdef ASM_CALL64_GCC
        %1      A2_16                       ; dx:ax = ax * operand
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                      ; MSC: A1 is rdx, which the mul clobbers - save it first.
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax                    ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; !RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2157
2158
BEGINCODE
;;
; Worker function for negating the 64-bit number held in T1:T0 (32-bit halves),
; using the two's complement identity -x == ~x + 1 (no stack traffic needed).
; @uses None (T0,T1 hold the result; flags are clobbered)
BEGINPROC iemAImpl_negate_T0_T1_u32
        not     T0_32                   ; Invert both halves, ...
        not     T1_32
        add     T0_32, 1                ; ... then add one, carrying ...
        adc     T1_32, 0                ; ... into the high half.
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2173
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the 128-bit number held in T1:T0 (64-bit halves),
; using the two's complement identity -x == ~x + 1 (no stack traffic needed).
; @uses None (T0,T1 hold the result; flags are clobbered)
BEGINPROC iemAImpl_negate_T0_T1_u64
        not     T0                      ; Invert both halves, ...
        not     T1
        add     T0, 1                   ; ... then add one, carrying ...
        adc     T1, 0                   ; ... into the high half.
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2189
2190
2191;;
2192; Macro for implementing a division operations.
2193;
2194; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2195; 32-bit system where the 64-bit accesses requires hand coding.
2196;
2197; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2198; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2199; pointer to eflags in A3.
2200;
2201; The functions all return 0 on success and -1 if a divide error should be
2202; raised by the caller.
2203;
2204; @param 1 The instruction mnemonic.
2205; @param 2 The modified flags.
2206; @param 3 The undefined flags.
2207; @param 4 1 if signed, 0 if unsigned.
2208; @param 5 Function suffix.
2209; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2210; 2 for AMD (set AF, clear PF, ZF and SF).
2211;
2212; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2213;
2214%macro IEMIMPL_DIV_OP 6
2215BEGINCODE
2216BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2217 PROLOGUE_3_ARGS
2218
2219 ; div by chainsaw check.
2220 and A1_32, 0xff ; Ensure it's zero extended to 16-bits for the idiv range check.
2221 jz .div_zero
2222
2223 ; Overflow check - unsigned division is simple to verify, haven't
2224 ; found a simple way to check signed division yet unfortunately.
2225 %if %4 == 0
2226 cmp [A0 + 1], A1_8
2227 jae .div_overflow
2228 %else
2229 movzx T0_32, word [A0] ; T0 = dividend (zero extending to full register to simplify register aliasing)
2230 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2231 test A1_8, A1_8
2232 js .divisor_negative
2233 test T0_16, T0_16
2234 jns .both_positive
2235 neg T0_16
2236.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2237 push T0 ; Start off like unsigned below.
2238 shr T0_16, 7
2239 cmp T0_16, A1_16 ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
2240 pop T0
2241 jb .div_no_overflow
2242 ja .div_overflow
2243 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2244 cmp T0_8, A1_8
2245 jae .div_overflow
2246 jmp .div_no_overflow
2247
2248.divisor_negative:
2249 neg A1_8
2250 test T0_16, T0_16
2251 jns .one_of_each
2252 neg T0_16
2253.both_positive: ; Same as unsigned shifted by sign indicator bit.
2254 shr T0_16, 7
2255 cmp T0_16, A1_16 ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
2256 jae .div_overflow
2257.div_no_overflow:
2258 mov A1, T1 ; restore divisor
2259 %endif
2260
2261 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2262 mov ax, [A0]
2263 %1 A1_8
2264 mov [A0], ax
2265 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2266 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2267 %else
2268 IEM_SAVE_FLAGS A2, %2, %3
2269 %endif
2270 xor eax, eax
2271
2272.return:
2273 EPILOGUE_3_ARGS
2274
2275.div_zero:
2276.div_overflow:
2277 mov eax, -1
2278 jmp .return
2279ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2280
2281BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2282 PROLOGUE_4_ARGS
2283
2284 ; div by chainsaw check.
2285 and A2_16, 0xffff ; Zero extend it for simpler sign overflow checks (see below).
2286 jz .div_zero
2287
2288 ; Overflow check - unsigned division is simple to verify, haven't
2289 ; found a simple way to check signed division yet unfortunately.
2290 %if %4 == 0
2291 cmp [A1], A2_16
2292 jae .div_overflow
2293 %else
2294 movzx T0_32, word [A1] ; Zero extend to simplify register aliasing by clobbing the whole register.
2295 shl T0_32, 16
2296 mov T0_16, [A0] ; T0 = dividend
2297 mov T1, A2 ; T1 = divisor
2298 test T1_16, T1_16
2299 js .divisor_negative
2300 test T0_32, T0_32
2301 jns .both_positive
2302 neg T0_32
2303.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2304 push T0 ; Start off like unsigned below.
2305 shr T0_32, 15
2306 cmp T0_32, T1_32 ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
2307 pop T0
2308 jb .div_no_overflow
2309 ja .div_overflow
2310 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2311 cmp T0_16, T1_16
2312 jae .div_overflow
2313 jmp .div_no_overflow
2314
2315.divisor_negative:
2316 neg T1_16
2317 test T0_32, T0_32
2318 jns .one_of_each
2319 neg T0_32
2320.both_positive: ; Same as unsigned shifted by sign indicator bit.
2321 shr T0_32, 15
2322 cmp T0_32, T1_32 ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
2323 jae .div_overflow
2324.div_no_overflow:
2325 %endif
2326
2327 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2328 %ifdef ASM_CALL64_GCC
2329 mov T1, A2
2330 mov ax, [A0]
2331 mov dx, [A1]
2332 %1 T1_16
2333 mov [A0], ax
2334 mov [A1], dx
2335 %else
2336 mov T1, A1
2337 mov ax, [A0]
2338 mov dx, [T1]
2339 %1 A2_16
2340 mov [A0], ax
2341 mov [T1], dx
2342 %endif
2343 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2344 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2345 %else
2346 IEM_SAVE_FLAGS A3, %2, %3
2347 %endif
2348 xor eax, eax
2349
2350.return:
2351 EPILOGUE_4_ARGS
2352
2353.div_zero:
2354.div_overflow:
2355 mov eax, -1
2356 jmp .return
2357ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2358
2359BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2360 PROLOGUE_4_ARGS
2361
2362 ; div by chainsaw check.
2363 test A2_32, A2_32
2364 jz .div_zero
2365
2366 ; Overflow check - unsigned division is simple to verify, haven't
2367 ; found a simple way to check signed division yet unfortunately.
2368 %if %4 == 0
2369 cmp [A1], A2_32
2370 jae .div_overflow
2371 %else
2372 push A2 ; save A2 so we modify it (we out of regs on x86).
2373 mov T0_32, [A0] ; T0 = dividend low
2374 mov T1_32, [A1] ; T1 = dividend high
2375 ;test A2_32, A2_32 - we did this 5 instructions ago.
2376 js .divisor_negative
2377 test T1_32, T1_32
2378 jns .both_positive
2379 call NAME(iemAImpl_negate_T0_T1_u32)
2380.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2381 test T1_32, 0x80000000 ; neg 0x8000000000000000 = 0x8000000000000000
2382 jnz .div_overflow
2383 push T0 ; Start off like unsigned below.
2384 shl T1_32, 1
2385 shr T0_32, 31
2386 or T1_32, T0_32
2387 cmp T1_32, A2_32
2388 pop T0
2389 jb .div_no_overflow
2390 ja .div_overflow
2391 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2392 cmp T0_32, A2_32
2393 jae .div_overflow
2394 jmp .div_no_overflow
2395
2396.divisor_negative:
2397 neg A2_32
2398 test T1_32, T1_32
2399 jns .one_of_each
2400 call NAME(iemAImpl_negate_T0_T1_u32)
2401.both_positive: ; Same as unsigned shifted by sign indicator bit.
2402 test T1_32, 0x80000000 ; neg 0x8000000000000000 = 0x8000000000000000
2403 jnz .div_overflow
2404 shl T1_32, 1
2405 shr T0_32, 31
2406 or T1_32, T0_32
2407 cmp T1_32, A2_32
2408 jae .div_overflow
2409.div_no_overflow:
2410 pop A2
2411 %endif
2412
2413 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2414 mov eax, [A0]
2415 %ifdef ASM_CALL64_GCC
2416 mov T1, A2
2417 mov eax, [A0]
2418 mov edx, [A1]
2419 %1 T1_32
2420 mov [A0], eax
2421 mov [A1], edx
2422 %else
2423 mov T1, A1
2424 mov eax, [A0]
2425 mov edx, [T1]
2426 %1 A2_32
2427 mov [A0], eax
2428 mov [T1], edx
2429 %endif
2430 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2431 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2432 %else
2433 IEM_SAVE_FLAGS A3, %2, %3
2434 %endif
2435 xor eax, eax
2436
2437.return:
2438 EPILOGUE_4_ARGS
2439
2440.div_overflow:
2441 %if %4 != 0
2442 pop A2
2443 %endif
2444.div_zero:
2445 mov eax, -1
2446 jmp .return
2447ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2448
2449 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2450BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2451 PROLOGUE_4_ARGS
2452
2453 test A2, A2
2454 jz .div_zero
2455 %if %4 == 0
2456 cmp [A1], A2
2457 jae .div_overflow
2458 %else
2459 push A2 ; save A2 so we modify it (we out of regs on x86).
2460 mov T0, [A0] ; T0 = dividend low
2461 mov T1, [A1] ; T1 = dividend high
2462 ;test A2, A2 - we did this five instructions above.
2463 js .divisor_negative
2464 test T1, T1
2465 jns .both_positive
2466 call NAME(iemAImpl_negate_T0_T1_u64)
2467.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2468 bt T1, 63 ; neg 0x8000000000000000'0000000000000000 = same
2469 jc .div_overflow
2470 push T0 ; Start off like unsigned below.
2471 shl T1, 1
2472 shr T0, 63
2473 or T1, T0
2474 cmp T1, A2
2475 pop T0
2476 jb .div_no_overflow
2477 ja .div_overflow
2478 mov T1, 0x7fffffffffffffff
2479 and T0, T1 ; Special case for covering (divisor - 1).
2480 cmp T0, A2
2481 jae .div_overflow
2482 jmp .div_no_overflow
2483
2484.divisor_negative:
2485 neg A2
2486 test T1, T1
2487 jns .one_of_each
2488 call NAME(iemAImpl_negate_T0_T1_u64)
2489.both_positive: ; Same as unsigned shifted by sign indicator bit.
2490 bt T1, 63 ; neg 0x8000000000000000'0000000000000000 = same
2491 jc .div_overflow
2492 shl T1, 1
2493 shr T0, 63
2494 or T1, T0
2495 cmp T1, A2
2496 jae .div_overflow
2497.div_no_overflow:
2498 pop A2
2499 %endif
2500
2501 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2502 mov rax, [A0]
2503 %ifdef ASM_CALL64_GCC
2504 mov T1, A2
2505 mov rax, [A0]
2506 mov rdx, [A1]
2507 %1 T1
2508 mov [A0], rax
2509 mov [A1], rdx
2510 %else
2511 mov T1, A1
2512 mov rax, [A0]
2513 mov rdx, [T1]
2514 %1 A2
2515 mov [A0], rax
2516 mov [T1], rdx
2517 %endif
2518 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2519 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2520 %else
2521 IEM_SAVE_FLAGS A3, %2, %3
2522 %endif
2523 xor eax, eax
2524
2525.return:
2526 EPILOGUE_4_ARGS_EX 12
2527
2528.div_overflow:
2529 %if %4 != 0
2530 pop A2
2531 %endif
2532.div_zero:
2533 mov eax, -1
2534 jmp .return
2535ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2536 %endif ; !RT_ARCH_AMD64
2537
2538%endmacro
2539
; Instantiations: mnemonic, modified flags, undefined flags, signedness,
; function suffix, EFLAGS variation (0 = native, 1 = intel, 2 = amd).
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2547
2548
2549;;
2550; Macro for implementing memory fence operation.
2551;
2552; No return value, no operands or anything.
2553;
2554; @param 1 The instruction.
2555;
2556%macro IEMIMPL_MEM_FENCE 1
2557BEGINCODE
2558BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
2559 %1
2560 ret
2561ENDPROC iemAImpl_ %+ %1
2562%endmacro
2563
2564IEMIMPL_MEM_FENCE lfence
2565IEMIMPL_MEM_FENCE sfence
2566IEMIMPL_MEM_FENCE mfence
2567
2568;;
2569; Alternative for non-SSE2 host.
2570;
2571BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
2572 push xAX
2573 xchg xAX, [xSP]
2574 add xSP, xCB
2575 ret
2576ENDPROC iemAImpl_alt_mem_fence
2577
2578
2579;;
2580; Initialize the FPU for the actual instruction being emulated, this means
2581; loading parts of the guest's control word and status word.
2582;
2583; @uses 24 bytes of stack. T0, T1
2584; @param 1 Expression giving the address of the FXSTATE of the guest.
2585;
2586%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
2587 fnstenv [xSP]
2588
2589 ; FCW - for exception, precision and rounding control.
2590 movzx T0, word [%1 + X86FXSTATE.FCW]
2591 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2592 mov [xSP + X86FSTENV32P.FCW], T0_16
2593
2594 ; FSW - for undefined C0, C1, C2, and C3.
2595 movzx T1, word [%1 + X86FXSTATE.FSW]
2596 and T1, X86_FSW_C_MASK
2597 movzx T0, word [xSP + X86FSTENV32P.FSW]
2598 and T0, X86_FSW_TOP_MASK
2599 or T0, T1
2600 mov [xSP + X86FSTENV32P.FSW], T0_16
2601
2602 fldenv [xSP]
2603%endmacro
2604
2605
2606;;
2607; Initialize the FPU for the actual instruction being emulated, this means
2608; loading parts of the guest's control word, status word, and update the
2609; tag word for the top register if it's empty.
2610;
2611; ASSUMES actual TOP=7
2612;
2613; @uses 24 bytes of stack. T0, T1
2614; @param 1 Expression giving the address of the FXSTATE of the guest.
2615;
2616%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
2617 fnstenv [xSP]
2618
2619 ; FCW - for exception, precision and rounding control.
2620 movzx T0_32, word [%1 + X86FXSTATE.FCW]
2621 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2622 mov [xSP + X86FSTENV32P.FCW], T0_16
2623
2624 ; FSW - for undefined C0, C1, C2, and C3.
2625 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2626 and T1_32, X86_FSW_C_MASK
2627 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
2628 and T0_32, X86_FSW_TOP_MASK
2629 or T0_32, T1_32
2630 mov [xSP + X86FSTENV32P.FSW], T0_16
2631
2632 ; FTW - Only for ST0 (in/out).
2633 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2634 shr T1_32, X86_FSW_TOP_SHIFT
2635 and T1_32, X86_FSW_TOP_SMASK
2636 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
2637 jc %%st0_not_empty
2638 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
2639%%st0_not_empty:
2640
2641 fldenv [xSP]
2642%endmacro
2643
2644
2645;;
2646; Need to move this as well somewhere better?
2647;
2648struc IEMFPURESULT
2649 .r80Result resw 5
2650 .FSW resw 1
2651endstruc
2652
2653
2654;;
2655; Need to move this as well somewhere better?
2656;
2657struc IEMFPURESULTTWO
2658 .r80Result1 resw 5
2659 .FSW resw 1
2660 .r80Result2 resw 5
2661endstruc
2662
2663
2664;
2665;---------------------- 16-bit signed integer operations ----------------------
2666;
2667
2668
2669;;
2670; Converts a 16-bit floating point value to a 80-bit one (fpu register).
2671;
2672; @param A0 FPU context (fxsave).
2673; @param A1 Pointer to a IEMFPURESULT for the output.
2674; @param A2 Pointer to the 16-bit floating point value to convert.
2675;
2676BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
2677 PROLOGUE_3_ARGS
2678 sub xSP, 20h
2679
2680 fninit
2681 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2682 fild word [A2]
2683
2684 fnstsw word [A1 + IEMFPURESULT.FSW]
2685 fnclex
2686 fstp tword [A1 + IEMFPURESULT.r80Result]
2687
2688 fninit
2689 add xSP, 20h
2690 EPILOGUE_3_ARGS
2691ENDPROC iemAImpl_fild_r80_from_i16
2692
2693
2694;;
2695; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2696;
2697; @param A0 FPU context (fxsave).
2698; @param A1 Where to return the output FSW.
2699; @param A2 Where to store the 16-bit signed integer value.
2700; @param A3 Pointer to the 80-bit value.
2701;
2702BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
2703 PROLOGUE_4_ARGS
2704 sub xSP, 20h
2705
2706 fninit
2707 fld tword [A3]
2708 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2709 fistp word [A2]
2710
2711 fnstsw word [A1]
2712
2713 fninit
2714 add xSP, 20h
2715 EPILOGUE_4_ARGS
2716ENDPROC iemAImpl_fist_r80_to_i16
2717
2718
2719;;
2720; Store a 80-bit floating point value (register) as a 16-bit signed integer
2721; (memory) with truncation.
2722;
2723; @param A0 FPU context (fxsave).
2724; @param A1 Where to return the output FSW.
2725; @param A2 Where to store the 16-bit signed integer value.
2726; @param A3 Pointer to the 80-bit value.
2727;
2728BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
2729 PROLOGUE_4_ARGS
2730 sub xSP, 20h
2731
2732 fninit
2733 fld tword [A3]
2734 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2735 fisttp word [A2]
2736
2737 fnstsw word [A1]
2738
2739 fninit
2740 add xSP, 20h
2741 EPILOGUE_4_ARGS
2742ENDPROC iemAImpl_fistt_r80_to_i16
2743
2744
2745;;
2746; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2747;
2748; @param 1 The instruction
2749;
2750; @param A0 FPU context (fxsave).
2751; @param A1 Pointer to a IEMFPURESULT for the output.
2752; @param A2 Pointer to the 80-bit value.
2753; @param A3 Pointer to the 16-bit value.
2754;
2755%macro IEMIMPL_FPU_R80_BY_I16 1
2756BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2757 PROLOGUE_4_ARGS
2758 sub xSP, 20h
2759
2760 fninit
2761 fld tword [A2]
2762 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2763 %1 word [A3]
2764
2765 fnstsw word [A1 + IEMFPURESULT.FSW]
2766 fnclex
2767 fstp tword [A1 + IEMFPURESULT.r80Result]
2768
2769 fninit
2770 add xSP, 20h
2771 EPILOGUE_4_ARGS
2772ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2773%endmacro
2774
2775IEMIMPL_FPU_R80_BY_I16 fiadd
2776IEMIMPL_FPU_R80_BY_I16 fimul
2777IEMIMPL_FPU_R80_BY_I16 fisub
2778IEMIMPL_FPU_R80_BY_I16 fisubr
2779IEMIMPL_FPU_R80_BY_I16 fidiv
2780IEMIMPL_FPU_R80_BY_I16 fidivr
2781
2782
2783;;
2784; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2785; only returning FSW.
2786;
2787; @param 1 The instruction
2788;
2789; @param A0 FPU context (fxsave).
2790; @param A1 Where to store the output FSW.
2791; @param A2 Pointer to the 80-bit value.
2792; @param A3 Pointer to the 64-bit value.
2793;
2794%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
2795BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2796 PROLOGUE_4_ARGS
2797 sub xSP, 20h
2798
2799 fninit
2800 fld tword [A2]
2801 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2802 %1 word [A3]
2803
2804 fnstsw word [A1]
2805
2806 fninit
2807 add xSP, 20h
2808 EPILOGUE_4_ARGS
2809ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2810%endmacro
2811
2812IEMIMPL_FPU_R80_BY_I16_FSW ficom
2813
2814
2815
2816;
2817;---------------------- 32-bit signed integer operations ----------------------
2818;
2819
2820
2821;;
2822; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2823;
2824; @param A0 FPU context (fxsave).
2825; @param A1 Pointer to a IEMFPURESULT for the output.
2826; @param A2 Pointer to the 32-bit floating point value to convert.
2827;
2828BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
2829 PROLOGUE_3_ARGS
2830 sub xSP, 20h
2831
2832 fninit
2833 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2834 fild dword [A2]
2835
2836 fnstsw word [A1 + IEMFPURESULT.FSW]
2837 fnclex
2838 fstp tword [A1 + IEMFPURESULT.r80Result]
2839
2840 fninit
2841 add xSP, 20h
2842 EPILOGUE_3_ARGS
2843ENDPROC iemAImpl_fild_r80_from_i32
2844
2845
2846;;
2847; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2848;
2849; @param A0 FPU context (fxsave).
2850; @param A1 Where to return the output FSW.
2851; @param A2 Where to store the 32-bit signed integer value.
2852; @param A3 Pointer to the 80-bit value.
2853;
2854BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2855 PROLOGUE_4_ARGS
2856 sub xSP, 20h
2857
2858 fninit
2859 fld tword [A3]
2860 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2861 fistp dword [A2]
2862
2863 fnstsw word [A1]
2864
2865 fninit
2866 add xSP, 20h
2867 EPILOGUE_4_ARGS
2868ENDPROC iemAImpl_fist_r80_to_i32
2869
2870
2871;;
2872; Store a 80-bit floating point value (register) as a 32-bit signed integer
2873; (memory) with truncation.
2874;
2875; @param A0 FPU context (fxsave).
2876; @param A1 Where to return the output FSW.
2877; @param A2 Where to store the 32-bit signed integer value.
2878; @param A3 Pointer to the 80-bit value.
2879;
2880BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2881 PROLOGUE_4_ARGS
2882 sub xSP, 20h
2883
2884 fninit
2885 fld tword [A3]
2886 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2887 fisttp dword [A2]
2888
2889 fnstsw word [A1]
2890
2891 fninit
2892 add xSP, 20h
2893 EPILOGUE_4_ARGS
2894ENDPROC iemAImpl_fistt_r80_to_i32
2895
2896
2897;;
2898; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2899;
2900; @param 1 The instruction
2901;
2902; @param A0 FPU context (fxsave).
2903; @param A1 Pointer to a IEMFPURESULT for the output.
2904; @param A2 Pointer to the 80-bit value.
2905; @param A3 Pointer to the 32-bit value.
2906;
2907%macro IEMIMPL_FPU_R80_BY_I32 1
2908BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2909 PROLOGUE_4_ARGS
2910 sub xSP, 20h
2911
2912 fninit
2913 fld tword [A2]
2914 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2915 %1 dword [A3]
2916
2917 fnstsw word [A1 + IEMFPURESULT.FSW]
2918 fnclex
2919 fstp tword [A1 + IEMFPURESULT.r80Result]
2920
2921 fninit
2922 add xSP, 20h
2923 EPILOGUE_4_ARGS
2924ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2925%endmacro
2926
2927IEMIMPL_FPU_R80_BY_I32 fiadd
2928IEMIMPL_FPU_R80_BY_I32 fimul
2929IEMIMPL_FPU_R80_BY_I32 fisub
2930IEMIMPL_FPU_R80_BY_I32 fisubr
2931IEMIMPL_FPU_R80_BY_I32 fidiv
2932IEMIMPL_FPU_R80_BY_I32 fidivr
2933
2934
2935;;
2936; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2937; only returning FSW.
2938;
2939; @param 1 The instruction
2940;
2941; @param A0 FPU context (fxsave).
2942; @param A1 Where to store the output FSW.
2943; @param A2 Pointer to the 80-bit value.
2944; @param A3 Pointer to the 64-bit value.
2945;
2946%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2947BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2948 PROLOGUE_4_ARGS
2949 sub xSP, 20h
2950
2951 fninit
2952 fld tword [A2]
2953 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2954 %1 dword [A3]
2955
2956 fnstsw word [A1]
2957
2958 fninit
2959 add xSP, 20h
2960 EPILOGUE_4_ARGS
2961ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2962%endmacro
2963
2964IEMIMPL_FPU_R80_BY_I32_FSW ficom
2965
2966
2967
2968;
2969;---------------------- 64-bit signed integer operations ----------------------
2970;
2971
2972
2973;;
2974; Converts a 64-bit floating point value to a 80-bit one (fpu register).
2975;
2976; @param A0 FPU context (fxsave).
2977; @param A1 Pointer to a IEMFPURESULT for the output.
2978; @param A2 Pointer to the 64-bit floating point value to convert.
2979;
2980BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
2981 PROLOGUE_3_ARGS
2982 sub xSP, 20h
2983
2984 fninit
2985 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2986 fild qword [A2]
2987
2988 fnstsw word [A1 + IEMFPURESULT.FSW]
2989 fnclex
2990 fstp tword [A1 + IEMFPURESULT.r80Result]
2991
2992 fninit
2993 add xSP, 20h
2994 EPILOGUE_3_ARGS
2995ENDPROC iemAImpl_fild_r80_from_i64
2996
2997
2998;;
2999; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
3000;
3001; @param A0 FPU context (fxsave).
3002; @param A1 Where to return the output FSW.
3003; @param A2 Where to store the 64-bit signed integer value.
3004; @param A3 Pointer to the 80-bit value.
3005;
3006BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
3007 PROLOGUE_4_ARGS
3008 sub xSP, 20h
3009
3010 fninit
3011 fld tword [A3]
3012 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3013 fistp qword [A2]
3014
3015 fnstsw word [A1]
3016
3017 fninit
3018 add xSP, 20h
3019 EPILOGUE_4_ARGS
3020ENDPROC iemAImpl_fist_r80_to_i64
3021
3022
3023;;
3024; Store a 80-bit floating point value (register) as a 64-bit signed integer
3025; (memory) with truncation.
3026;
3027; @param A0 FPU context (fxsave).
3028; @param A1 Where to return the output FSW.
3029; @param A2 Where to store the 64-bit signed integer value.
3030; @param A3 Pointer to the 80-bit value.
3031;
3032BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
3033 PROLOGUE_4_ARGS
3034 sub xSP, 20h
3035
3036 fninit
3037 fld tword [A3]
3038 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3039 fisttp qword [A2]
3040
3041 fnstsw word [A1]
3042
3043 fninit
3044 add xSP, 20h
3045 EPILOGUE_4_ARGS
3046ENDPROC iemAImpl_fistt_r80_to_i64
3047
3048
3049
3050;
3051;---------------------- 32-bit floating point operations ----------------------
3052;
3053
3054;;
3055; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3056;
3057; @param A0 FPU context (fxsave).
3058; @param A1 Pointer to a IEMFPURESULT for the output.
3059; @param A2 Pointer to the 32-bit floating point value to convert.
3060;
3061BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
3062 PROLOGUE_3_ARGS
3063 sub xSP, 20h
3064
3065 fninit
3066 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3067 fld dword [A2]
3068
3069 fnstsw word [A1 + IEMFPURESULT.FSW]
3070 fnclex
3071 fstp tword [A1 + IEMFPURESULT.r80Result]
3072
3073 fninit
3074 add xSP, 20h
3075 EPILOGUE_3_ARGS
3076ENDPROC iemAImpl_fld_r80_from_r32
3077
3078
3079;;
3080; Store a 80-bit floating point value (register) as a 32-bit one (memory).
3081;
3082; @param A0 FPU context (fxsave).
3083; @param A1 Where to return the output FSW.
3084; @param A2 Where to store the 32-bit value.
3085; @param A3 Pointer to the 80-bit value.
3086;
3087BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
3088 PROLOGUE_4_ARGS
3089 sub xSP, 20h
3090
3091 fninit
3092 fld tword [A3]
3093 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3094 fst dword [A2]
3095
3096 fnstsw word [A1]
3097
3098 fninit
3099 add xSP, 20h
3100 EPILOGUE_4_ARGS
3101ENDPROC iemAImpl_fst_r80_to_r32
3102
3103
3104;;
3105; FPU instruction working on one 80-bit and one 32-bit floating point value.
3106;
3107; @param 1 The instruction
3108;
3109; @param A0 FPU context (fxsave).
3110; @param A1 Pointer to a IEMFPURESULT for the output.
3111; @param A2 Pointer to the 80-bit value.
3112; @param A3 Pointer to the 32-bit value.
3113;
3114%macro IEMIMPL_FPU_R80_BY_R32 1
3115BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3116 PROLOGUE_4_ARGS
3117 sub xSP, 20h
3118
3119 fninit
3120 fld tword [A2]
3121 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3122 %1 dword [A3]
3123
3124 fnstsw word [A1 + IEMFPURESULT.FSW]
3125 fnclex
3126 fstp tword [A1 + IEMFPURESULT.r80Result]
3127
3128 fninit
3129 add xSP, 20h
3130 EPILOGUE_4_ARGS
3131ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3132%endmacro
3133
3134IEMIMPL_FPU_R80_BY_R32 fadd
3135IEMIMPL_FPU_R80_BY_R32 fmul
3136IEMIMPL_FPU_R80_BY_R32 fsub
3137IEMIMPL_FPU_R80_BY_R32 fsubr
3138IEMIMPL_FPU_R80_BY_R32 fdiv
3139IEMIMPL_FPU_R80_BY_R32 fdivr
3140
3141
3142;;
3143; FPU instruction working on one 80-bit and one 32-bit floating point value,
3144; only returning FSW.
3145;
3146; @param 1 The instruction
3147;
3148; @param A0 FPU context (fxsave).
3149; @param A1 Where to store the output FSW.
3150; @param A2 Pointer to the 80-bit value.
3151; @param A3 Pointer to the 64-bit value.
3152;
3153%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3154BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3155 PROLOGUE_4_ARGS
3156 sub xSP, 20h
3157
3158 fninit
3159 fld tword [A2]
3160 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3161 %1 dword [A3]
3162
3163 fnstsw word [A1]
3164
3165 fninit
3166 add xSP, 20h
3167 EPILOGUE_4_ARGS
3168ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3169%endmacro
3170
3171IEMIMPL_FPU_R80_BY_R32_FSW fcom
3172
3173
3174
3175;
3176;---------------------- 64-bit floating point operations ----------------------
3177;
3178
3179;;
3180; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3181;
3182; @param A0 FPU context (fxsave).
3183; @param A1 Pointer to a IEMFPURESULT for the output.
3184; @param A2 Pointer to the 64-bit floating point value to convert.
3185;
3186BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3187 PROLOGUE_3_ARGS
3188 sub xSP, 20h
3189
3190 fninit
3191 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3192 fld qword [A2]
3193
3194 fnstsw word [A1 + IEMFPURESULT.FSW]
3195 fnclex
3196 fstp tword [A1 + IEMFPURESULT.r80Result]
3197
3198 fninit
3199 add xSP, 20h
3200 EPILOGUE_3_ARGS
3201ENDPROC iemAImpl_fld_r80_from_r64
3202
3203
3204;;
3205; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3206;
3207; @param A0 FPU context (fxsave).
3208; @param A1 Where to return the output FSW.
3209; @param A2 Where to store the 64-bit value.
3210; @param A3 Pointer to the 80-bit value.
3211;
3212BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3213 PROLOGUE_4_ARGS
3214 sub xSP, 20h
3215
3216 fninit
3217 fld tword [A3]
3218 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3219 fst qword [A2]
3220
3221 fnstsw word [A1]
3222
3223 fninit
3224 add xSP, 20h
3225 EPILOGUE_4_ARGS
3226ENDPROC iemAImpl_fst_r80_to_r64
3227
3228
3229;;
3230; FPU instruction working on one 80-bit and one 64-bit floating point value.
3231;
3232; @param 1 The instruction
3233;
3234; @param A0 FPU context (fxsave).
3235; @param A1 Pointer to a IEMFPURESULT for the output.
3236; @param A2 Pointer to the 80-bit value.
3237; @param A3 Pointer to the 64-bit value.
3238;
3239%macro IEMIMPL_FPU_R80_BY_R64 1
3240BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3241 PROLOGUE_4_ARGS
3242 sub xSP, 20h
3243
3244 fninit
3245 fld tword [A2]
3246 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3247 %1 qword [A3]
3248
3249 fnstsw word [A1 + IEMFPURESULT.FSW]
3250 fnclex
3251 fstp tword [A1 + IEMFPURESULT.r80Result]
3252
3253 fninit
3254 add xSP, 20h
3255 EPILOGUE_4_ARGS
3256ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3257%endmacro
3258
3259IEMIMPL_FPU_R80_BY_R64 fadd
3260IEMIMPL_FPU_R80_BY_R64 fmul
3261IEMIMPL_FPU_R80_BY_R64 fsub
3262IEMIMPL_FPU_R80_BY_R64 fsubr
3263IEMIMPL_FPU_R80_BY_R64 fdiv
3264IEMIMPL_FPU_R80_BY_R64 fdivr
3265
3266;;
3267; FPU instruction working on one 80-bit and one 64-bit floating point value,
3268; only returning FSW.
3269;
3270; @param 1 The instruction
3271;
3272; @param A0 FPU context (fxsave).
3273; @param A1 Where to store the output FSW.
3274; @param A2 Pointer to the 80-bit value.
3275; @param A3 Pointer to the 64-bit value.
3276;
3277%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3278BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3279 PROLOGUE_4_ARGS
3280 sub xSP, 20h
3281
3282 fninit
3283 fld tword [A2]
3284 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3285 %1 qword [A3]
3286
3287 fnstsw word [A1]
3288
3289 fninit
3290 add xSP, 20h
3291 EPILOGUE_4_ARGS
3292ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3293%endmacro
3294
3295IEMIMPL_FPU_R80_BY_R64_FSW fcom
3296
3297
3298
3299;
3300;---------------------- 80-bit floating point operations ----------------------
3301;
3302
3303;;
3304; Loads a 80-bit floating point register value from memory.
3305;
3306; @param A0 FPU context (fxsave).
3307; @param A1 Pointer to a IEMFPURESULT for the output.
3308; @param A2 Pointer to the 80-bit floating point value to load.
3309;
3310BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3311 PROLOGUE_3_ARGS
3312 sub xSP, 20h
3313
3314 fninit
3315 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3316 fld tword [A2]
3317
3318 fnstsw word [A1 + IEMFPURESULT.FSW]
3319 fnclex
3320 fstp tword [A1 + IEMFPURESULT.r80Result]
3321
3322 fninit
3323 add xSP, 20h
3324 EPILOGUE_3_ARGS
3325ENDPROC iemAImpl_fld_r80_from_r80
3326
3327
3328;;
3329; Store a 80-bit floating point register to memory
3330;
3331; @param A0 FPU context (fxsave).
3332; @param A1 Where to return the output FSW.
3333; @param A2 Where to store the 80-bit value.
3334; @param A3 Pointer to the 80-bit register value.
3335;
3336BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3337 PROLOGUE_4_ARGS
3338 sub xSP, 20h
3339
3340 fninit
3341 fld tword [A3]
3342 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3343 fstp tword [A2]
3344
3345 fnstsw word [A1]
3346
3347 fninit
3348 add xSP, 20h
3349 EPILOGUE_4_ARGS
3350ENDPROC iemAImpl_fst_r80_to_r80
3351
3352
3353;;
3354; Loads an 80-bit floating point register value in BCD format from memory.
3355;
3356; @param A0 FPU context (fxsave).
3357; @param A1 Pointer to a IEMFPURESULT for the output.
3358; @param A2 Pointer to the 80-bit BCD value to load.
3359;
3360BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3361 PROLOGUE_3_ARGS
3362 sub xSP, 20h
3363
3364 fninit
3365 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3366 fbld tword [A2]
3367
3368 fnstsw word [A1 + IEMFPURESULT.FSW]
3369 fnclex
3370 fstp tword [A1 + IEMFPURESULT.r80Result]
3371
3372 fninit
3373 add xSP, 20h
3374 EPILOGUE_3_ARGS
3375ENDPROC iemAImpl_fld_r80_from_d80
3376
3377
3378;;
3379; Store a 80-bit floating point register to memory as BCD
3380;
3381; @param A0 FPU context (fxsave).
3382; @param A1 Where to return the output FSW.
3383; @param A2 Where to store the 80-bit BCD value.
3384; @param A3 Pointer to the 80-bit register value.
3385;
3386BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3387 PROLOGUE_4_ARGS
3388 sub xSP, 20h
3389
3390 fninit
3391 fld tword [A3]
3392 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3393 fbstp tword [A2]
3394
3395 fnstsw word [A1]
3396
3397 fninit
3398 add xSP, 20h
3399 EPILOGUE_4_ARGS
3400ENDPROC iemAImpl_fst_r80_to_d80
3401
3402
3403;;
3404; FPU instruction working on two 80-bit floating point values.
3405;
3406; @param 1 The instruction
3407; @param 2 The instruction operands, e.g. {st0, st1}; empty for the
3408; implicit-operand instructions (fprem, fprem1, fscale).
3409;
3410; @param A0 FPU context (fxsave).
3411; @param A1 Pointer to a IEMFPURESULT for the output.
3412; @param A2 Pointer to the first 80-bit value (ST0)
3413; @param A3 Pointer to the second 80-bit value (STn).
3414;
3415%macro IEMIMPL_FPU_R80_BY_R80 2
3416BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3417 PROLOGUE_4_ARGS
3418 sub xSP, 20h ; scratch area (paired with the add below); presumably used by FPU_LD_FXSTATE_* - TODO confirm
3419
3420 fninit ; clean FPU state so the loads cannot hit a full register stack
3421 fld tword [A3] ; becomes st1 after the next fld
3422 fld tword [A2] ; st0 = [A2], st1 = [A3]
3423 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW (and a safe FSW) from the fxsave image at A0
3424 %1 %2 ; e.g. 'fadd st0, st1' or plain 'fprem'
3425
3426 fnstsw word [A1 + IEMFPURESULT.FSW] ; record the resulting status word
3427 fnclex ; clear pending exceptions so the fstp below cannot re-raise them
3428 fstp tword [A1 + IEMFPURESULT.r80Result] ; return the st0 result
3429
3430 fninit ; leave a clean FPU state for the host
3431 add xSP, 20h
3432 EPILOGUE_4_ARGS
3433ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3434%endmacro
3435
3436IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3437IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3438IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3439IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3440IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3441IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3442IEMIMPL_FPU_R80_BY_R80 fprem, {}
3443IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3444IEMIMPL_FPU_R80_BY_R80 fscale, {}
3443
3444
3445;;
3446; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3447; storing the result in ST1 and popping the stack.
3448;
3449; @param 1 The instruction
3450;
3451; @param A0 FPU context (fxsave).
3452; @param A1 Pointer to a IEMFPURESULT for the output.
3453; @param A2 Pointer to the first 80-bit value (ST1).
3454; @param A3 Pointer to the second 80-bit value (ST0).
3455;
3456%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3457BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3458 PROLOGUE_4_ARGS
3459 sub xSP, 20h ; scratch area (paired with the add below); presumably used by FPU_LD_FXSTATE_* - TODO confirm
3460
3461 fninit ; clean FPU state so the loads cannot hit a full register stack
3462 fld tword [A2] ; becomes st1 (the emulated ST1) after the next fld
3463 fld tword [A3] ; st0 = the emulated ST0
3464 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW (and a safe FSW) from the fxsave image at A0
3465 %1 ; fpatan/fyl2x/fyl2xp1: result goes to st1, stack is popped
3466
3467 fnstsw word [A1 + IEMFPURESULT.FSW] ; record the resulting status word
3468 fnclex ; clear pending exceptions so the fstp below cannot re-raise them
3469 fstp tword [A1 + IEMFPURESULT.r80Result] ; after the pop the result sits in st0
3470
3471 fninit ; leave a clean FPU state for the host
3472 add xSP, 20h
3473 EPILOGUE_4_ARGS
3474ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3475%endmacro
3476
3477IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3478IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3479IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3480
3481
3482;;
3483; FPU instruction working on two 80-bit floating point values, only
3484; returning FSW.
3485;
3486; @param 1 The instruction
3487;
3488; @param A0 FPU context (fxsave).
3489; @param A1 Pointer to a uint16_t for the resulting FSW.
3490; @param A2 Pointer to the first 80-bit value.
3491; @param A3 Pointer to the second 80-bit value.
3492;
3493%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3494BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3495 PROLOGUE_4_ARGS
3496 sub xSP, 20h ; scratch area (paired with the add below); presumably used by FPU_LD_FXSTATE_* - TODO confirm
3497
3498 fninit ; clean FPU state so the loads cannot hit a full register stack
3499 fld tword [A3] ; becomes st1 after the next fld
3500 fld tword [A2] ; st0 = [A2], st1 = [A3]
3501 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW (and a safe FSW) from the fxsave image at A0
3502 %1 st0, st1 ; fcom/fucom: compare, results only in the status word
3503
3504 fnstsw word [A1] ; hand back the resulting status word (C0/C2/C3)
3505
3506 fninit ; leave a clean FPU state for the host
3507 add xSP, 20h
3508 EPILOGUE_4_ARGS
3509ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3510%endmacro
3511
3512IEMIMPL_FPU_R80_BY_R80_FSW fcom
3513IEMIMPL_FPU_R80_BY_R80_FSW fucom
3514
3515
3516;;
3517; FPU instruction working on two 80-bit floating point values,
3518; returning FSW and EFLAGS (eax).
3519;
3520; @param 1 The instruction
3521;
3522; @returns EFLAGS in EAX.
3523; @param A0 FPU context (fxsave).
3524; @param A1 Pointer to a uint16_t for the resulting FSW.
3525; @param A2 Pointer to the first 80-bit value.
3526; @param A3 Pointer to the second 80-bit value.
3527;
3528%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
3529BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3530 PROLOGUE_4_ARGS
3531 sub xSP, 20h ; scratch area (paired with the add below); presumably used by FPU_LD_FXSTATE_* - TODO confirm
3532
3533 fninit ; clean FPU state so the loads cannot hit a full register stack
3534 fld tword [A3] ; becomes st1 after the next fld
3535 fld tword [A2] ; st0 = [A2], st1 = [A3]
3536 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW (and a safe FSW) from the fxsave image at A0
3537 %1 st1 ; fcomi/fucomi st0, st1 - sets ZF/PF/CF directly
3538
3539 fnstsw word [A1] ; hand back the resulting status word
3540 pushf ; capture the comparison result flags...
3541 pop xAX ; ...and return them in xAX
3542
3543 fninit ; leave a clean FPU state for the host
3544 add xSP, 20h
3545 EPILOGUE_4_ARGS
3546ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3547%endmacro
3548
3549IEMIMPL_FPU_R80_BY_R80_EFL fcomi
3550IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3551
3552
3553;;
3554; FPU instruction working on one 80-bit floating point value.
3555;
3556; @param 1 The instruction
3557;
3558; @param A0 FPU context (fxsave).
3559; @param A1 Pointer to a IEMFPURESULT for the output.
3560; @param A2 Pointer to the 80-bit value.
3561;
3562%macro IEMIMPL_FPU_R80 1
3563BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3564 PROLOGUE_3_ARGS
3565 sub xSP, 20h ; scratch area (paired with the add below); presumably used by FPU_LD_FXSTATE_* - TODO confirm
3566
3567 fninit ; clean FPU state so the fld cannot hit a full register stack
3568 fld tword [A2] ; st0 = input value
3569 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW (and a safe FSW) from the fxsave image at A0
3570 %1 ; unary op on st0 (fchs, fabs, fsqrt, ...)
3571
3572 fnstsw word [A1 + IEMFPURESULT.FSW] ; record the resulting status word
3573 fnclex ; clear pending exceptions so the fstp below cannot re-raise them
3574 fstp tword [A1 + IEMFPURESULT.r80Result] ; return the st0 result
3575
3576 fninit ; leave a clean FPU state for the host
3577 add xSP, 20h
3578 EPILOGUE_3_ARGS
3579ENDPROC iemAImpl_ %+ %1 %+ _r80
3580%endmacro
3581
3582IEMIMPL_FPU_R80 fchs
3583IEMIMPL_FPU_R80 fabs
3584IEMIMPL_FPU_R80 f2xm1
3585IEMIMPL_FPU_R80 fsqrt
3586IEMIMPL_FPU_R80 frndint
3587IEMIMPL_FPU_R80 fsin
3588IEMIMPL_FPU_R80 fcos
3589
3590
3591;;
3592; FPU instruction working on one 80-bit floating point value, only
3593; returning FSW.
3594;
3595; @param 1 The instruction
3596; @param 2 Non-zero to also restore FTW.
3597;
3598; @param A0 FPU context (fxsave).
3599; @param A1 Pointer to a uint16_t for the resulting FSW.
3600; @param A2 Pointer to the 80-bit value.
3601;
3602%macro IEMIMPL_FPU_R80_FSW 2
3603BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3604 PROLOGUE_3_ARGS
3605 sub xSP, 20h ; scratch area (paired with the add below); presumably used by FPU_LD_FXSTATE_* - TODO confirm
3606
3607 fninit ; clean FPU state so the fld cannot hit a full register stack
3608 fld tword [A2] ; st0 = input value
3609%if %2 != 0
3610 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0 ; fxam also inspects the tag word, so restore FTW too
3611%else
3612 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW (and a safe FSW) from the fxsave image at A0
3613%endif
3614 %1 ; ftst/fxam - results only in the status word
3615
3616 fnstsw word [A1] ; hand back the resulting status word
3617
3618 fninit ; leave a clean FPU state for the host
3619 add xSP, 20h
3620 EPILOGUE_3_ARGS
3621ENDPROC iemAImpl_ %+ %1 %+ _r80
3622%endmacro
3623
3624IEMIMPL_FPU_R80_FSW ftst, 0
3625IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3626
3627
3628
3629;;
3630; FPU instruction loading a 80-bit floating point constant.
3631;
3632; @param 1 The instruction
3633;
3634; @param A0 FPU context (fxsave).
3635; @param A1 Pointer to a IEMFPURESULT for the output.
3636;
3637%macro IEMIMPL_FPU_R80_CONST 1
3638BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
3639 PROLOGUE_2_ARGS
3640 sub xSP, 20h ; scratch area (paired with the add below); presumably used by FPU_LD_FXSTATE_* - TODO confirm
3641
3642 fninit ; clean FPU state so the constant push cannot hit a full register stack
3643 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW (and a safe FSW) - rounding affects fldl2t & co
3644 %1 ; push the constant onto the stack (fld1, fldpi, ...)
3645
3646 fnstsw word [A1 + IEMFPURESULT.FSW] ; record the resulting status word
3647 fnclex ; clear pending exceptions so the fstp below cannot re-raise them
3648 fstp tword [A1 + IEMFPURESULT.r80Result] ; return the constant
3649
3650 fninit ; leave a clean FPU state for the host
3651 add xSP, 20h
3652 EPILOGUE_2_ARGS
3653ENDPROC iemAImpl_ %+ %1 %+
3654%endmacro
3655
3656IEMIMPL_FPU_R80_CONST fld1
3657IEMIMPL_FPU_R80_CONST fldl2t
3658IEMIMPL_FPU_R80_CONST fldl2e
3659IEMIMPL_FPU_R80_CONST fldpi
3660IEMIMPL_FPU_R80_CONST fldlg2
3661IEMIMPL_FPU_R80_CONST fldln2
3662IEMIMPL_FPU_R80_CONST fldz
3663
3664
3665;;
3666; FPU instruction working on one 80-bit floating point value, outputing two.
3667;
3668; @param 1 The instruction
3669;
3670; @param A0 FPU context (fxsave).
3671; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3672; @param A2 Pointer to the 80-bit value.
3673;
3674%macro IEMIMPL_FPU_R80_R80 1
3675BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
3676 PROLOGUE_3_ARGS
3677 sub xSP, 20h ; scratch area (paired with the add below); presumably used by FPU_LD_FXSTATE_* - TODO confirm
3678
3679 fninit ; clean FPU state so the fld cannot hit a full register stack
3680 fld tword [A2] ; st0 = input value
3681 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW (and a safe FSW) from the fxsave image at A0
3682 %1 ; fptan/fxtract/fsincos push a 2nd result onto the stack
3683
3684 fnstsw word [A1 + IEMFPURESULTTWO.FSW] ; record the resulting status word
3685 fnclex ; clear pending exceptions before each store...
3686 fstp tword [A1 + IEMFPURESULTTWO.r80Result2] ; result2 = st0 (the value pushed last)
3687 fnclex ; ...so neither fstp can re-raise them
3688 fstp tword [A1 + IEMFPURESULTTWO.r80Result1] ; result1 = what was st1
3689
3690 fninit ; leave a clean FPU state for the host
3691 add xSP, 20h
3692 EPILOGUE_3_ARGS
3693ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
3694%endmacro
3695
3696IEMIMPL_FPU_R80_R80 fptan
3697IEMIMPL_FPU_R80_R80 fxtract
3698IEMIMPL_FPU_R80_R80 fsincos
3699
3700
3701
3702
3703;---------------------- SSE and MMX Operations ----------------------
3704
3705;; @todo what do we need to do for MMX?
3706; Intentionally empty placeholders bracketing every MMX helper below
3707; (a future emms or state save/restore would go here).
3708%macro IEMIMPL_MMX_PROLOGUE 0
3709%endmacro
3710%macro IEMIMPL_MMX_EPILOGUE 0
3711%endmacro
3712
3713;; @todo what do we need to do for SSE?
3714; Intentionally empty placeholders bracketing every SSE helper below.
3715%macro IEMIMPL_SSE_PROLOGUE 0
3716%endmacro
3717%macro IEMIMPL_SSE_EPILOGUE 0
3718%endmacro
3719
3720;; @todo what do we need to do for AVX?
3721; Intentionally empty placeholders bracketing every AVX helper below
3722; (a future vzeroupper would go here).
3723%macro IEMIMPL_AVX_PROLOGUE 0
3724%endmacro
3725%macro IEMIMPL_AVX_EPILOGUE 0
3726%endmacro
3722
3723
3724;;
3725; Media instruction working on two full sized registers.
3726;
3727; @param 1 The instruction
3728; @param 2 Whether there is an MMX variant (1) or not (0).
3729;
3730; @param A0 FPU context (fxsave). Note: not referenced by the generated code.
3731; @param A1 Pointer to the first media register size operand (input/output).
3732; @param A2 Pointer to the second media register size operand (input).
3733;
3734%macro IEMIMPL_MEDIA_F2 2
3735%if %2 != 0
3736BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3737 PROLOGUE_3_ARGS
3738 IEMIMPL_MMX_PROLOGUE
3739
3740 movq mm0, [A1] ; mm0 = destination operand
3741 movq mm1, [A2] ; mm1 = source operand
3742 %1 mm0, mm1
3743 movq [A1], mm0 ; write the result back in place
3744
3745 IEMIMPL_MMX_EPILOGUE
3746 EPILOGUE_3_ARGS
3747ENDPROC iemAImpl_ %+ %1 %+ _u64
3748%endif
3749
3750BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3751 PROLOGUE_3_ARGS
3752 IEMIMPL_SSE_PROLOGUE
3753
3754 movdqu xmm0, [A1] ; xmm0 = destination operand (unaligned-safe load)
3755 movdqu xmm1, [A2] ; xmm1 = source operand
3756 %1 xmm0, xmm1
3757 movdqu [A1], xmm0 ; write the result back in place
3758
3759 IEMIMPL_SSE_EPILOGUE
3760 EPILOGUE_3_ARGS
3761ENDPROC iemAImpl_ %+ %1 %+ _u128
3762%endmacro
3763
3764IEMIMPL_MEDIA_F2 pshufb, 1
3765IEMIMPL_MEDIA_F2 pand, 1
3766IEMIMPL_MEDIA_F2 pandn, 1
3767IEMIMPL_MEDIA_F2 por, 1
3768IEMIMPL_MEDIA_F2 pxor, 1
3769IEMIMPL_MEDIA_F2 pcmpeqb, 1
3770IEMIMPL_MEDIA_F2 pcmpeqw, 1
3771IEMIMPL_MEDIA_F2 pcmpeqd, 1
3772IEMIMPL_MEDIA_F2 pcmpeqq, 0
3773IEMIMPL_MEDIA_F2 pcmpgtb, 1
3774IEMIMPL_MEDIA_F2 pcmpgtw, 1
3775IEMIMPL_MEDIA_F2 pcmpgtd, 1
3776IEMIMPL_MEDIA_F2 pcmpgtq, 0
3777IEMIMPL_MEDIA_F2 paddb, 1
3778IEMIMPL_MEDIA_F2 paddw, 1
3779IEMIMPL_MEDIA_F2 paddd, 1
3780IEMIMPL_MEDIA_F2 paddq, 1
3781IEMIMPL_MEDIA_F2 paddsb, 1
3782IEMIMPL_MEDIA_F2 paddsw, 1
3783IEMIMPL_MEDIA_F2 paddusb, 1
3784IEMIMPL_MEDIA_F2 paddusw, 1
3785IEMIMPL_MEDIA_F2 psubb, 1
3786IEMIMPL_MEDIA_F2 psubw, 1
3787IEMIMPL_MEDIA_F2 psubd, 1
3788IEMIMPL_MEDIA_F2 psubq, 1
3789IEMIMPL_MEDIA_F2 psubsb, 1
3790IEMIMPL_MEDIA_F2 psubsw, 1
3791IEMIMPL_MEDIA_F2 psubusb, 1
3792IEMIMPL_MEDIA_F2 psubusw, 1
3793IEMIMPL_MEDIA_F2 pmullw, 1
3794IEMIMPL_MEDIA_F2 pmulld, 0
3795IEMIMPL_MEDIA_F2 pmulhw, 1
3796IEMIMPL_MEDIA_F2 pmaddwd, 1
3797IEMIMPL_MEDIA_F2 pminub, 1
3798IEMIMPL_MEDIA_F2 pminuw, 0
3799IEMIMPL_MEDIA_F2 pminud, 0
3800IEMIMPL_MEDIA_F2 pminsb, 0
3801IEMIMPL_MEDIA_F2 pminsw, 1
3802IEMIMPL_MEDIA_F2 pminsd, 0
3803IEMIMPL_MEDIA_F2 pmaxub, 1
3804IEMIMPL_MEDIA_F2 pmaxuw, 0
3805IEMIMPL_MEDIA_F2 pmaxud, 0
3806IEMIMPL_MEDIA_F2 pmaxsb, 0
3807IEMIMPL_MEDIA_F2 pmaxsw, 1
3808IEMIMPL_MEDIA_F2 pmaxsd, 0
3809IEMIMPL_MEDIA_F2 pabsb, 1
3810IEMIMPL_MEDIA_F2 pabsw, 1
3811IEMIMPL_MEDIA_F2 pabsd, 1
3812IEMIMPL_MEDIA_F2 psignb, 1
3813IEMIMPL_MEDIA_F2 psignw, 1
3814IEMIMPL_MEDIA_F2 psignd, 1
3815IEMIMPL_MEDIA_F2 phaddw, 1
3816IEMIMPL_MEDIA_F2 phaddd, 1
3817IEMIMPL_MEDIA_F2 phsubw, 1
3818IEMIMPL_MEDIA_F2 phsubd, 1
3819IEMIMPL_MEDIA_F2 phaddsw, 1
3820IEMIMPL_MEDIA_F2 phsubsw, 1
3821IEMIMPL_MEDIA_F2 pmaddubsw, 1
3822IEMIMPL_MEDIA_F2 pmulhrsw, 1
3823IEMIMPL_MEDIA_F2 pmuludq, 1
3824
3825
3826;;
3827; Media instruction working on two full sized registers, but no FXSAVE state argument.
3828;
3829; @param 1 The instruction
3830; @param 2 Whether there is an MMX variant (1) or not (0).
3831;
3832; @param A0 Pointer to the first media register size operand (input/output).
3833; @param A1 Pointer to the second media register size operand (input).
3834;
3835%macro IEMIMPL_MEDIA_OPT_F2 2
3836%if %2 != 0
3837BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3838 PROLOGUE_2_ARGS
3839 IEMIMPL_MMX_PROLOGUE
3840
3841 movq mm0, [A0] ; mm0 = destination operand
3842 movq mm1, [A1] ; mm1 = source operand
3843 %1 mm0, mm1
3844 movq [A0], mm0 ; write the result back in place
3845
3846 IEMIMPL_MMX_EPILOGUE
3847 EPILOGUE_2_ARGS
3848ENDPROC iemAImpl_ %+ %1 %+ _u64
3849%endif
3850
3851BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3852 PROLOGUE_2_ARGS
3853 IEMIMPL_SSE_PROLOGUE
3854
3855 movdqu xmm0, [A0] ; xmm0 = destination operand (unaligned-safe load)
3856 movdqu xmm1, [A1] ; xmm1 = source operand
3857 %1 xmm0, xmm1
3858 movdqu [A0], xmm0 ; write the result back in place
3859
3860 IEMIMPL_SSE_EPILOGUE
3861 EPILOGUE_2_ARGS
3862ENDPROC iemAImpl_ %+ %1 %+ _u128
3863%endmacro
3864
3865IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3866IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3867IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3868IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3869IEMIMPL_MEDIA_OPT_F2 psllw, 1
3870IEMIMPL_MEDIA_OPT_F2 pslld, 1
3871IEMIMPL_MEDIA_OPT_F2 psllq, 1
3872IEMIMPL_MEDIA_OPT_F2 psrlw, 1
3873IEMIMPL_MEDIA_OPT_F2 psrld, 1
3874IEMIMPL_MEDIA_OPT_F2 psrlq, 1
3875IEMIMPL_MEDIA_OPT_F2 psraw, 1
3876IEMIMPL_MEDIA_OPT_F2 psrad, 1
3877IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
3878IEMIMPL_MEDIA_OPT_F2 pavgb, 1
3879IEMIMPL_MEDIA_OPT_F2 pavgw, 1
3880IEMIMPL_MEDIA_OPT_F2 psadbw, 1
3881IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
3882IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
3883IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
3884IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
3885IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3886IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3887IEMIMPL_MEDIA_OPT_F2 aesimc, 0
3888IEMIMPL_MEDIA_OPT_F2 aesenc, 0
3889IEMIMPL_MEDIA_OPT_F2 aesdec, 0
3890IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
3891IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3892IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
3893IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
3894IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
3895IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
3896IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3897
3898;;
3899; Media instruction working on one full sized and one half sized register (lower half).
3900;
3901; @param 1 The instruction
3902; @param 2 1 if MMX is included, 0 if not.
3903;
3904; @param A0 Pointer to the first full sized media register operand (input/output).
3905; @param A1 Pointer to the second half sized media register operand (input).
3906;
3907%macro IEMIMPL_MEDIA_F1L1 2
3908 %if %2 != 0
3909BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3910 PROLOGUE_2_ARGS
3911 IEMIMPL_MMX_PROLOGUE
3912
3913 movq mm0, [A0] ; mm0 = destination operand
3914 movq mm1, [A1] ; mm1 = source; punpckl* only uses its low half
3915 %1 mm0, mm1
3916 movq [A0], mm0 ; write the result back in place
3917
3918 IEMIMPL_MMX_EPILOGUE
3919 EPILOGUE_2_ARGS
3920ENDPROC iemAImpl_ %+ %1 %+ _u64
3921 %endif
3922
3923BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3924 PROLOGUE_2_ARGS
3925 IEMIMPL_SSE_PROLOGUE
3926
3927 movdqu xmm0, [A0] ; xmm0 = destination operand (unaligned-safe load)
3928 movdqu xmm1, [A1] ; xmm1 = source; the instruction picks the half it needs
3929 %1 xmm0, xmm1
3930 movdqu [A0], xmm0 ; write the result back in place
3931
3932 IEMIMPL_SSE_EPILOGUE
3933 EPILOGUE_2_ARGS
3934ENDPROC iemAImpl_ %+ %1 %+ _u128
3935%endmacro
3936
3937IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3938IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3939IEMIMPL_MEDIA_F1L1 punpckldq, 1
3940IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3940IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3941
3942
3943;;
3944; Media instruction working two half sized input registers (lower half) and a full sized
3945; destination register (vpunpckl*).
3946;
3947; @param 1 The instruction
3948;
3949; @param A0 Pointer to the destination register (full sized, output only).
3950; @param A1 Pointer to the first full sized media source register operand, where we
3951; will only use the lower half as input - but we'll be loading it in full.
3952; @param A2 Pointer to the second full sized media source register operand, where we
3953; will only use the lower half as input - but we'll be loading it in full.
3954;
3955%macro IEMIMPL_MEDIA_F1L1L1 1
3956BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3957 PROLOGUE_3_ARGS
3958 IEMIMPL_AVX_PROLOGUE
3959
3960 vmovdqu xmm0, [A1] ; xmm0 = first source
3961 vmovdqu xmm1, [A2] ; xmm1 = second source
3962 %1 xmm0, xmm0, xmm1 ; three-operand AVX form; result into xmm0
3963 vmovdqu [A0], xmm0 ; store to the separate destination
3964
3965 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both expand to nothing today)
3966 EPILOGUE_3_ARGS
3967ENDPROC iemAImpl_ %+ %1 %+ _u128
3968
3969BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
3970 PROLOGUE_3_ARGS
3971 IEMIMPL_AVX_PROLOGUE
3972
3973 vmovdqu ymm0, [A1] ; ymm0 = first source
3974 vmovdqu ymm1, [A2] ; ymm1 = second source
3975 %1 ymm0, ymm0, ymm1 ; three-operand AVX form; result into ymm0
3976 vmovdqu [A0], ymm0 ; store to the separate destination
3977
3978 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both expand to nothing today)
3979 EPILOGUE_3_ARGS
3980ENDPROC iemAImpl_ %+ %1 %+ _u256
3981%endmacro
3982
3983IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
3984IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
3985IEMIMPL_MEDIA_F1L1L1 vpunpckldq
3986IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3987
3988
3989;;
3990; Media instruction working on one full sized and one half sized register (high half).
3991;
3992; @param 1 The instruction
3993; @param 2 1 if MMX is included, 0 if not.
3994;
3995; @param A0 Pointer to the first full sized media register operand (input/output).
3996; @param A1 Pointer to the second full sized media register operand, where we
3997; will only use the upper half as input - but we'll load it in full.
3998;
3999%macro IEMIMPL_MEDIA_F1H1 2
4000IEMIMPL_MEDIA_F1L1 %1, %2 ; identical code shape: punpckh* select the high halves themselves
4001%endmacro
4002
4003IEMIMPL_MEDIA_F1L1 punpckhbw, 1
4004IEMIMPL_MEDIA_F1L1 punpckhwd, 1
4005IEMIMPL_MEDIA_F1L1 punpckhdq, 1
4006IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
4007
4008
4009;;
4010; Media instruction working two half sized input registers (high half) and a full sized
4011; destination register (vpunpckh*).
4012;
4013; @param 1 The instruction
4014;
4015; @param A0 Pointer to the destination register (full sized, output only).
4016; @param A1 Pointer to the first full sized media source register operand, where we
4017; will only use the upper half as input - but we'll be loading it in full.
4018; @param A2 Pointer to the second full sized media source register operand, where we
4019; will only use the upper half as input - but we'll be loading it in full.
4020;
4021%macro IEMIMPL_MEDIA_F1H1H1 1
4022IEMIMPL_MEDIA_F1L1L1 %1 ; identical code shape: vpunpckh* select the high halves themselves
4023%endmacro
4024
4025IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
4026IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
4027IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
4028IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4029
4030
4031;
4032; Shufflers with evil 8-bit immediates.
4033;
4034; The immediate byte selects one of 256 pre-generated pshufw stubs via a
4035; computed call; each stub is exactly sizeof(pshufw+ret) bytes long, which
4036; is what the scaled lea arithmetic below relies on.
4037;
4038; @param A0 Pointer to the destination (output).
4039; @param A1 Pointer to the source operand.
4040; @param A2 The 8-bit immediate (order selector).
4041;
4042BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
4043 PROLOGUE_3_ARGS
4044 IEMIMPL_MMX_PROLOGUE
4045
4046 movzx A2, A2_8 ; must clear top bits
4047 movq mm1, [A1] ; mm1 = source
4048 movq mm0, mm1 ; paranoia! (was 'movq mm0, mm0', a no-op; the SSE variant copies src->dst the same way)
4049 lea T1, [.imm0 xWrtRIP] ; T1 = base of the stub table
4050 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4051 lea T0, [A2 + A2*8] ; sizeof(pshufw+ret) == 9
4052 %else
4053 lea T0, [A2 + A2*4] ; sizeof(pshufw+ret) == 5
4054 %endif
4055 lea T1, [T1 + T0] ; T1 = &.imm<A2>
4056 IBT_NOTRACK
4057 call T1 ; executes 'pshufw mm0, mm1, A2'
4058 movq [A0], mm0 ; store the shuffled result
4059
4060 IEMIMPL_MMX_EPILOGUE
4061 EPILOGUE_3_ARGS
4062%assign bImm 0
4063%rep 256
4064.imm %+ bImm:
4065 IBT_ENDBRxx_WITHOUT_NOTRACK
4066 pshufw mm0, mm1, bImm
4067 ret
4068 %assign bImm bImm + 1
4069%endrep
4070.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
4071ENDPROC iemAImpl_pshufw_u64
4065
4066
4067; Generates iemAImpl_<insn>_u128 for the SSE pshuf* instructions taking an
4068; 8-bit immediate: the immediate selects one of 256 pre-generated stubs via
4069; a computed call (each stub is exactly sizeof(pshufXX+ret) bytes).
4070; A0 = destination pointer, A1 = source pointer, A2 = immediate.
4071%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
4068BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4069 PROLOGUE_3_ARGS
4070 IEMIMPL_SSE_PROLOGUE
4071
4072 movzx A2, A2_8 ; must clear top bits
4073 movdqu xmm1, [A1] ; xmm1 = source
4074 movdqu xmm0, xmm1 ; paranoia!
4075 lea T1, [.imm0 xWrtRIP] ; T1 = base of the stub table
4076 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4077 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
4078 %else
4079 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4080 %endif
4081 lea T1, [T1 + T0*2] ; finish the *2 of the size scaling
4082 IBT_NOTRACK
4083 call T1 ; executes '%1 xmm0, xmm1, A2'
4084 movdqu [A0], xmm0 ; store the shuffled result
4085
4086 IEMIMPL_SSE_EPILOGUE
4087 EPILOGUE_3_ARGS
4088
4089 %assign bImm 0
4090 %rep 256
4091.imm %+ bImm:
4092 IBT_ENDBRxx_WITHOUT_NOTRACK
4093 %1 xmm0, xmm1, bImm
4094 ret
4095 %assign bImm bImm + 1
4096 %endrep
4097.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4098ENDPROC iemAImpl_ %+ %1 %+ _u128
4099%endmacro
4100
4101IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
4102IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
4103IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4104
4105
4106; Generates iemAImpl_<insn>_u256 for the AVX vpshuf* instructions taking an
4107; 8-bit immediate, using the same 256-stub computed-call scheme as the SSE
4108; variant above. A0 = destination pointer, A1 = source pointer, A2 = immediate.
4109%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
4107BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4108 PROLOGUE_3_ARGS
4109 IEMIMPL_AVX_PROLOGUE ; was IEMIMPL_SSE_PROLOGUE - this is AVX code (both expand to nothing today)
4110
4111 movzx A2, A2_8 ; must clear top bits
4112 vmovdqu ymm1, [A1] ; ymm1 = source
4113 vmovdqu ymm0, ymm1 ; paranoia!
4114 lea T1, [.imm0 xWrtRIP] ; T1 = base of the stub table
4115 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4116 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
4117 %else
4118 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4119 %endif
4120 lea T1, [T1 + T0*2] ; finish the *2 of the size scaling
4121 IBT_NOTRACK
4122 call T1 ; executes '%1 ymm0, ymm1, A2'
4123 vmovdqu [A0], ymm0 ; store the shuffled result
4124
4125 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_SSE_EPILOGUE - this is AVX code (both expand to nothing today)
4126 EPILOGUE_3_ARGS
4127 %assign bImm 0
4128 %rep 256
4129.imm %+ bImm:
4130 IBT_ENDBRxx_WITHOUT_NOTRACK
4131 %1 ymm0, ymm1, bImm
4132 ret
4133 %assign bImm bImm + 1
4134 %endrep
4135.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4136ENDPROC iemAImpl_ %+ %1 %+ _u256
4137%endmacro
4138
4139IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
4140IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
4141IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4142
4143
4144;
4145; Shifts with evil 8-bit immediates.
4146;
4147
4148; Generates iemAImpl_<insn>_imm_u64 for MMX immediate shifts: the immediate
4149; selects one of 256 pre-generated stubs via a computed call.
4150; A0 = operand pointer (in/out), A1 = immediate shift count.
4151%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
4149BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
4150 PROLOGUE_2_ARGS
4151 IEMIMPL_MMX_PROLOGUE
4152
4153 movzx A1, A1_8 ; must clear top bits
4154 movq mm0, [A0] ; mm0 = operand to shift
4155 lea T1, [.imm0 xWrtRIP] ; T1 = base of the stub table
4156 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4157 lea T0, [A1 + A1*8] ; sizeof(psXX+ret) == 9
4158 %else
4159 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 5
4160 %endif
4161 lea T1, [T1 + T0] ; T1 = &.imm<A1>
4162 IBT_NOTRACK
4163 call T1 ; executes '%1 mm0, A1'
4164 movq [A0], mm0 ; write the result back in place
4165
4166 IEMIMPL_MMX_EPILOGUE
4167 EPILOGUE_2_ARGS
4168%assign bImm 0
4169%rep 256
4170.imm %+ bImm:
4171 IBT_ENDBRxx_WITHOUT_NOTRACK
4172 %1 mm0, bImm
4173 ret
4174 %assign bImm bImm + 1
4175%endrep
4176.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
4177ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
4178%endmacro
4179
4180IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4181IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4182IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4183IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4184IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4185IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4186IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4187IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4188
4189
4190; Generates iemAImpl_<insn>_imm_u128 for SSE immediate shifts: the immediate
4191; selects one of 256 pre-generated stubs via a computed call.
4192; A0 = operand pointer (in/out), A1 = immediate shift count.
4193%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
4191BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
4192 PROLOGUE_2_ARGS
4193 IEMIMPL_SSE_PROLOGUE
4194
4195 movzx A1, A1_8 ; must clear top bits
4196 movdqu xmm0, [A0] ; xmm0 = operand to shift (unaligned-safe load)
4197 lea T1, [.imm0 xWrtRIP] ; T1 = base of the stub table
4198 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4199 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
4200 %else
4201 lea T0, [A1 + A1*2] ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
4202 %endif
4203 lea T1, [T1 + T0*2] ; finish the *2 of the size scaling
4204 IBT_NOTRACK
4205 call T1 ; executes '%1 xmm0, A1'
4206 movdqu [A0], xmm0 ; write the result back in place
4207
4208 IEMIMPL_SSE_EPILOGUE
4209 EPILOGUE_2_ARGS
4210 %assign bImm 0
4211 %rep 256
4212.imm %+ bImm:
4213 IBT_ENDBRxx_WITHOUT_NOTRACK
4214 %1 xmm0, bImm
4215 ret
4216 %assign bImm bImm + 1
4217 %endrep
4218.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4219ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
4220%endmacro
4221
4222IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
4223IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
4224IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
4225IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
4226IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
4227IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
4228IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
4229IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
4230IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
4231IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4232
4233
4234;
4235; Move byte mask.
4236;
4237; A0 = pointer to the 64-bit result, A1 = pointer to the source operand.
4238BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
4239 PROLOGUE_2_ARGS
4240 IEMIMPL_MMX_PROLOGUE
4241
4242 movq mm1, [A1] ; mm1 = source
4243 pmovmskb T0, mm1 ; T0 = one sign bit per source byte (low 8 bits)
4244 mov [A0], T0 ; store the mask
4245%ifdef RT_ARCH_X86
4246 mov dword [A0 + 4], 0 ; 32-bit host: explicitly zero the upper half of the 64-bit result
4247%endif
4248 IEMIMPL_MMX_EPILOGUE
4249 EPILOGUE_2_ARGS
4250ENDPROC iemAImpl_pmovmskb_u64
4251
4252BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
4253 PROLOGUE_2_ARGS
4254 IEMIMPL_SSE_PROLOGUE
4255
4256 movdqu xmm1, [A1] ; xmm1 = source (unaligned-safe load)
4257 pmovmskb T0, xmm1 ; T0 = one sign bit per source byte (low 16 bits)
4258 mov [A0], T0 ; store the mask
4259%ifdef RT_ARCH_X86
4260 mov dword [A0 + 4], 0 ; 32-bit host: explicitly zero the upper half of the 64-bit result
4261%endif
4262 IEMIMPL_SSE_EPILOGUE
4263 EPILOGUE_2_ARGS
4264ENDPROC iemAImpl_pmovmskb_u128
4265
4266BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
4267 PROLOGUE_2_ARGS
4268 IEMIMPL_AVX_PROLOGUE
4269
4270 vmovdqu ymm1, [A1] ; ymm1 = source (unaligned-safe load)
4271 vpmovmskb T0, ymm1 ; T0 = one sign bit per source byte (low 32 bits)
4272 mov [A0], T0 ; store the mask
4273%ifdef RT_ARCH_X86
4274 mov dword [A0 + 4], 0 ; 32-bit host: explicitly zero the upper half of the 64-bit result
4275%endif
4276 IEMIMPL_AVX_EPILOGUE
4277 EPILOGUE_2_ARGS
4278ENDPROC iemAImpl_vpmovmskb_u256
4279
4280
4281;;
4282; Media instruction working on two full sized source registers and one destination (AVX).
4283;
4284; @param 1 The instruction
4285;
4286; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA). Note: not
4287; referenced by the generated code.
4288; @param A1 Pointer to the destination media register size operand (output).
4289; @param A2 Pointer to the first source media register size operand (input).
4290; @param A3 Pointer to the second source media register size operand (input).
4291;
4291%macro IEMIMPL_MEDIA_F3 1
4292BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4293 PROLOGUE_4_ARGS
4294 IEMIMPL_AVX_PROLOGUE
4295
4296 vmovdqu xmm0, [A2] ; xmm0 = first source
4297 vmovdqu xmm1, [A3] ; xmm1 = second source
4298 %1 xmm0, xmm0, xmm1 ; three-operand AVX form; result into xmm0
4299 vmovdqu [A1], xmm0 ; store to the separate destination
4300
4301 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both expand to nothing today)
4302 EPILOGUE_4_ARGS
4303ENDPROC iemAImpl_ %+ %1 %+ _u128
4304
4305BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4306 PROLOGUE_4_ARGS
4307 IEMIMPL_AVX_PROLOGUE
4308
4309 vmovdqu ymm0, [A2] ; ymm0 = first source
4310 vmovdqu ymm1, [A3] ; ymm1 = second source
4311 %1 ymm0, ymm0, ymm1 ; three-operand AVX form; result into ymm0
4312 vmovdqu [A1], ymm0 ; store to the separate destination
4313
4314 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both expand to nothing today)
4315 EPILOGUE_4_ARGS
4316ENDPROC iemAImpl_ %+ %1 %+ _u256
4317%endmacro
4318
4319IEMIMPL_MEDIA_F3 vpshufb
4320IEMIMPL_MEDIA_F3 vpand
4321IEMIMPL_MEDIA_F3 vpminub
4322IEMIMPL_MEDIA_F3 vpminuw
4323IEMIMPL_MEDIA_F3 vpminud
4324IEMIMPL_MEDIA_F3 vpminsb
4325IEMIMPL_MEDIA_F3 vpminsw
4326IEMIMPL_MEDIA_F3 vpminsd
4327IEMIMPL_MEDIA_F3 vpmaxub
4328IEMIMPL_MEDIA_F3 vpmaxuw
4329IEMIMPL_MEDIA_F3 vpmaxud
4330IEMIMPL_MEDIA_F3 vpmaxsb
4331IEMIMPL_MEDIA_F3 vpmaxsw
4332IEMIMPL_MEDIA_F3 vpmaxsd
4333IEMIMPL_MEDIA_F3 vpandn
4334IEMIMPL_MEDIA_F3 vpor
4335IEMIMPL_MEDIA_F3 vpxor
4336IEMIMPL_MEDIA_F3 vpcmpeqb
4337IEMIMPL_MEDIA_F3 vpcmpeqw
4338IEMIMPL_MEDIA_F3 vpcmpeqd
4339IEMIMPL_MEDIA_F3 vpcmpeqq
4340IEMIMPL_MEDIA_F3 vpcmpgtb
4341IEMIMPL_MEDIA_F3 vpcmpgtw
4342IEMIMPL_MEDIA_F3 vpcmpgtd
4343IEMIMPL_MEDIA_F3 vpcmpgtq
4344IEMIMPL_MEDIA_F3 vpaddb
4345IEMIMPL_MEDIA_F3 vpaddw
4346IEMIMPL_MEDIA_F3 vpaddd
4347IEMIMPL_MEDIA_F3 vpaddq
4348IEMIMPL_MEDIA_F3 vpsubb
4349IEMIMPL_MEDIA_F3 vpsubw
4350IEMIMPL_MEDIA_F3 vpsubd
4351IEMIMPL_MEDIA_F3 vpsubq
4352
4353
4354;;
4355; Media instruction working on two full sized source registers and one destination (AVX),
4356; but no XSAVE state pointer argument.
4357;
4358; @param 1 The instruction
4359;
4360; @param A0 Pointer to the destination media register size operand (output).
4361; @param A1 Pointer to the first source media register size operand (input).
4362; @param A2 Pointer to the second source media register size operand (input).
4363;
4364%macro IEMIMPL_MEDIA_OPT_F3 1
4365BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4366 PROLOGUE_3_ARGS
4367 IEMIMPL_AVX_PROLOGUE
4368
4369 vmovdqu xmm0, [A1] ; xmm0 = first source
4370 vmovdqu xmm1, [A2] ; xmm1 = second source
4371 %1 xmm0, xmm0, xmm1 ; three-operand AVX form; result into xmm0
4372 vmovdqu [A0], xmm0 ; store to the separate destination
4373
4374 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both expand to nothing today)
4375 EPILOGUE_3_ARGS
4376ENDPROC iemAImpl_ %+ %1 %+ _u128
4377
4378BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4379 PROLOGUE_3_ARGS
4380 IEMIMPL_AVX_PROLOGUE
4381
4382 vmovdqu ymm0, [A1] ; ymm0 = first source
4383 vmovdqu ymm1, [A2] ; ymm1 = second source
4384 %1 ymm0, ymm0, ymm1 ; three-operand AVX form; result into ymm0
4385 vmovdqu [A0], ymm0 ; store to the separate destination
4386
4387 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both expand to nothing today)
4388 EPILOGUE_3_ARGS
4389ENDPROC iemAImpl_ %+ %1 %+ _u256
4390%endmacro
4391
4392IEMIMPL_MEDIA_OPT_F3 vpacksswb
4393IEMIMPL_MEDIA_OPT_F3 vpackssdw
4394IEMIMPL_MEDIA_OPT_F3 vpackuswb
4395IEMIMPL_MEDIA_OPT_F3 vpackusdw
4396IEMIMPL_MEDIA_OPT_F3 vpmullw
4397IEMIMPL_MEDIA_OPT_F3 vpmulld
4398IEMIMPL_MEDIA_OPT_F3 vpmulhw
4399IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4400IEMIMPL_MEDIA_OPT_F3 vpavgb
4401IEMIMPL_MEDIA_OPT_F3 vpavgw
4402IEMIMPL_MEDIA_OPT_F3 vpsignb
4403IEMIMPL_MEDIA_OPT_F3 vpsignw
4404IEMIMPL_MEDIA_OPT_F3 vpsignd
4405IEMIMPL_MEDIA_OPT_F3 vphaddw
4406IEMIMPL_MEDIA_OPT_F3 vphaddd
4407IEMIMPL_MEDIA_OPT_F3 vphsubw
4408IEMIMPL_MEDIA_OPT_F3 vphsubd
4409IEMIMPL_MEDIA_OPT_F3 vphaddsw
4410IEMIMPL_MEDIA_OPT_F3 vphsubsw
4411IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4412IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4413IEMIMPL_MEDIA_OPT_F3 vpsadbw
4414IEMIMPL_MEDIA_OPT_F3 vpmuldq
4415IEMIMPL_MEDIA_OPT_F3 vpmuludq
4416IEMIMPL_MEDIA_OPT_F3 vunpcklps
4417IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4418IEMIMPL_MEDIA_OPT_F3 vunpckhps
4419IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4420IEMIMPL_MEDIA_OPT_F3 vpsubsb
4421IEMIMPL_MEDIA_OPT_F3 vpsubsw
4422IEMIMPL_MEDIA_OPT_F3 vpsubusb
4423IEMIMPL_MEDIA_OPT_F3 vpsubusw
4424IEMIMPL_MEDIA_OPT_F3 vpaddusb
4425IEMIMPL_MEDIA_OPT_F3 vpaddusw
4426IEMIMPL_MEDIA_OPT_F3 vpaddsb
4427IEMIMPL_MEDIA_OPT_F3 vpaddsw
4428IEMIMPL_MEDIA_OPT_F3 vpermilps
4429IEMIMPL_MEDIA_OPT_F3 vpermilpd
4430
4431;;
4432; Media instruction working on one full sized source register, one full sized destination
4433; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
4434; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
4435; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
4436; of either 16, 32, or 64, it acts like the max shift size)
4437;
4438; @param 1 The instruction
4439;
4440; @param A0 Pointer to the destination media register size operand (output).
4441; @param A1 Pointer to the first source media register size operand (input).
4442; @param A2 Pointer to the second source media register size operand (input).
4443;
4444%macro IEMIMPL_SHIFT_OPT_F3 1
4445BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4446 PROLOGUE_3_ARGS
4447 IEMIMPL_AVX_PROLOGUE
4448
4449 vmovdqu xmm0, [A1] ; xmm0 = value to shift
4450 vmovdqu xmm1, [A2] ; xmm1 = 128-bit shift-count operand
4451 %1 xmm0, xmm0, xmm1 ; three-operand AVX form; result into xmm0
4452 vmovdqu [A0], xmm0 ; store to the separate destination
4453
4454 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both expand to nothing today)
4455 EPILOGUE_3_ARGS
4456ENDPROC iemAImpl_ %+ %1 %+ _u128
4457
4458BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4459 PROLOGUE_3_ARGS
4460 IEMIMPL_AVX_PROLOGUE
4461
4462 vmovdqu ymm0, [A1] ; ymm0 = value to shift
4463 vmovdqu xmm1, [A2] ; the shift count is always an XMM operand, even for ymm shifts
4464 %1 ymm0, ymm0, xmm1 ; three-operand AVX form; result into ymm0
4465 vmovdqu [A0], ymm0 ; store to the separate destination
4466
4467 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both expand to nothing today)
4468 EPILOGUE_3_ARGS
4469ENDPROC iemAImpl_ %+ %1 %+ _u256
4470%endmacro
4471
4472IEMIMPL_SHIFT_OPT_F3 vpsllw
4473IEMIMPL_SHIFT_OPT_F3 vpslld
4474IEMIMPL_SHIFT_OPT_F3 vpsllq
4475IEMIMPL_SHIFT_OPT_F3 vpsraw
4476IEMIMPL_SHIFT_OPT_F3 vpsrad
4477IEMIMPL_SHIFT_OPT_F3 vpsrlw
4478IEMIMPL_SHIFT_OPT_F3 vpsrld
4479IEMIMPL_SHIFT_OPT_F3 vpsrlq
4480
4481
4482;;
4483; Media instruction working on one full sized source registers and one destination (AVX),
4484; but no XSAVE state pointer argument.
4485;
4486; @param 1 The instruction
4487; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
4488;
4489; @param A0 Pointer to the destination media register size operand (output).
4490; @param A1 Pointer to the source media register size operand (input).
4491;
4492%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
4493BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4494 PROLOGUE_2_ARGS
4495 IEMIMPL_AVX_PROLOGUE
4496
4497 vmovdqu xmm0, [A1] ; xmm0 = source
4498 %1 xmm0, xmm0 ; unary op; result into xmm0
4499 vmovdqu [A0], xmm0 ; store to the separate destination
4500
4501 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both expand to nothing today)
4502 EPILOGUE_2_ARGS
4503ENDPROC iemAImpl_ %+ %1 %+ _u128
4504
4505 %if %2 == 1
4506BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4507 PROLOGUE_2_ARGS
4508 IEMIMPL_AVX_PROLOGUE
4509
4510 vmovdqu ymm0, [A1] ; ymm0 = source
4511 %1 ymm0, ymm0 ; unary op; result into ymm0
4512 vmovdqu [A0], ymm0 ; store to the separate destination
4513
4514 IEMIMPL_AVX_EPILOGUE ; was IEMIMPL_AVX_PROLOGUE (copy/paste typo; both expand to nothing today)
4515 EPILOGUE_2_ARGS
4516ENDPROC iemAImpl_ %+ %1 %+ _u256
4517 %endif
4518%endmacro
4519
4520IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4521IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4522IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4523IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4524
4525
4526;
4527; The SSE 4.2 crc32
4528;
4529; @param A0 Pointer to the 32-bit destination.
4530; @param A1 The source operand, sized according to the suffix.
4531;
4532BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
4533 PROLOGUE_2_ARGS
4534
4535 mov T0_32, [A0] ; T0 = running CRC accumulator
4536 crc32 T0_32, A1_8 ; fold in the 8-bit source operand
4537 mov [A0], T0_32 ; store the updated CRC in place
4538
4539 EPILOGUE_2_ARGS
4540ENDPROC iemAImpl_crc32_u8
4541
4542BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
4543 PROLOGUE_2_ARGS
4544
4545 mov T0_32, [A0] ; T0 = running CRC accumulator
4546 crc32 T0_32, A1_16 ; fold in the 16-bit source operand
4547 mov [A0], T0_32 ; store the updated CRC in place
4548
4549 EPILOGUE_2_ARGS
4550ENDPROC iemAImpl_crc32_u16
4551
4552BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
4553 PROLOGUE_2_ARGS
4554
4555 mov T0_32, [A0] ; T0 = running CRC accumulator
4556 crc32 T0_32, A1_32 ; fold in the 32-bit source operand
4557 mov [A0], T0_32 ; store the updated CRC in place
4558
4559 EPILOGUE_2_ARGS
4560ENDPROC iemAImpl_crc32_u32
4561
4562%ifdef RT_ARCH_AMD64 ; 64-bit crc32 source operand only exists on AMD64 hosts
4563BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
4564 PROLOGUE_2_ARGS
4565
4566 mov T0_32, [A0] ; T0 = running CRC accumulator (32-bit; upper bits irrelevant)
4567 crc32 T0, A1 ; fold in the 64-bit source operand
4568 mov [A0], T0_32 ; store the updated 32-bit CRC in place
4569
4570 EPILOGUE_2_ARGS
4571ENDPROC iemAImpl_crc32_u64
4572%endif
4573
4574
4575;
4576; PTEST (SSE 4.1)
4577;
4578; @param A0 Pointer to the first source operand (aka readonly destination).
4579; @param A1 Pointer to the second source operand.
4580; @param A2 Pointer to the EFLAGS register.
4581;
; Performs the SSE4.1 PTEST on the two memory operands and stores the
; resulting status flags into the EFLAGS location given in A2.
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; first source (read-only "destination")
        movdqu  xmm1, [A1]              ; second source
        ptest   xmm0, xmm1              ; sets ZF/CF from AND / ANDN of the two operands
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; copy the host status flags out for the guest

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4594
; Performs the AVX VPTEST (256-bit) on the two memory operands and stores
; the resulting status flags into the EFLAGS location given in A2.
; NOTE(review): uses the SSE prologue/epilogue pair rather than the AVX
; one like neighbouring AVX helpers - presumably equivalent; confirm.
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        vmovdqu ymm0, [A0]              ; first source (read-only "destination")
        vmovdqu ymm1, [A1]              ; second source
        vptest  ymm0, ymm1              ; sets ZF/CF from AND / ANDN of the two operands
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; copy the host status flags out for the guest

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4607
4608
4609;;
4610; Template for the [v]pmov{s,z}x* instructions
4611;
4612; @param 1 The instruction
4613;
4614; @param A0 Pointer to the destination media register size operand (output).
4615; @param A1 The source operand value (input).
4616;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1                ; NOTE(review): loads only 32 bits of the source value; variants
                                        ; consuming a 64-bit source (e.g. pmovsxbw) may need movq - TODO confirm
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_SSE_EPILOGUE            ; fix: was IEMIMPL_SSE_PROLOGUE (prologue invoked twice, epilogue never)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]              ; 256-bit result consumes a full 128-bit source, so load from memory
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4668
4669
4670;;
4671; Need to move this as well somewhere better?
4672;
; Result of an SSE media operation: the 128-bit value plus the guest MXCSR.
struc IEMSSERESULT
    .uResult      resd 4               ; The 128-bit result value.
    .MXCSR        resd 1               ; The MXCSR value after the operation.
endstruc
4677
4678
4679;;
4680; Need to move this as well somewhere better?
4681;
; Result of a 128-bit AVX media operation: the value plus the guest MXCSR.
struc IEMAVX128RESULT
    .uResult      resd 4               ; The 128-bit result value.
    .MXCSR        resd 1               ; The MXCSR value after the operation.
endstruc
4686
4687
4688;;
4689; Need to move this as well somewhere better?
4690;
; Result of a 256-bit AVX media operation: the value plus the guest MXCSR.
struc IEMAVX256RESULT
    .uResult      resd 8               ; The 256-bit result value.
    .MXCSR        resd 1               ; The MXCSR value after the operation.
endstruc
4695
4696
4697;;
4698; Initialize the SSE MXCSR register using the guest value partially to
4699; account for rounding mode.
4700;
4701; @uses 4 bytes of stack to save the original value, T0.
4702; @param 1 Expression giving the address of the FXSTATE of the guest.
4703;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR; deliberately left on the stack for
                                        ; SSE_ST_FXSTATE_MXCSR / the *_ST_XSAVEAREA_MXCSR macros to restore
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only rounding/denormal control from the guest
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions so the host never takes a #XF here
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the merged control value
        add     xSP, 4
%endmacro
4716
4717
4718;;
4719; Restores the SSE MXCSR register with the original value.
4720;
4721; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4722; @param 1 Expression giving the address where to return the MXCSR value.
4723; @param 2 Expression giving the address of the FXSTATE of the guest.
4724;
4725; @note Restores the stack pointer.
4726;
%macro SSE_ST_FXSTATE_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; read the MXCSR produced by the emulated instruction
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the freshly raised exception flags
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by SSE_LD_FXSTATE_MXCSR
        add     xSP, 4                  ; drop the saved value, restoring the stack pointer
%endmacro
4741
4742
4743;;
4744; Initialize the SSE MXCSR register using the guest value partially to
4745; account for rounding mode.
4746;
4747; @uses 4 bytes of stack to save the original value.
4748; @param 1 Expression giving the address of the FXSTATE of the guest.
4749;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR; left on the stack for the
                                        ; AVX128/256_ST_XSAVEAREA_MXCSR macros to restore
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only rounding/denormal control from the guest
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the merged control value
        add     xSP, 4
%endmacro
4761
4762
4763;;
4764; Restores the AVX128 MXCSR register with the original value.
4765;
4766; @param 1 Expression giving the address where to return the MXCSR value.
4767;
4768; @note Restores the stack pointer.
4769;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR] ; return the MXCSR after the operation

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
        add     xSP, 4                  ; drop the saved value, restoring the stack pointer
%endmacro
4776
4777
4778;;
4779; Restores the AVX256 MXCSR register with the original value.
4780;
4781; @param 1 Expression giving the address where to return the MXCSR value.
4782;
4783; @note Restores the stack pointer.
4784;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR] ; return the MXCSR after the operation

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
        add     xSP, 4                  ; drop the saved value, restoring the stack pointer
%endmacro
4791
4792
4793;;
4794; Floating point instruction working on two full sized registers.
4795;
4796; @param 1 The instruction
4797; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4798;
4799; @param A0 FPU context (FXSTATE or XSAVEAREA).
4800; @param A1 Where to return the result including the MXCSR value.
4801; @param A2 Pointer to the first media register size operand (input/output).
4802; @param A3 Pointer to the second media register size operand (input).
4803;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; fix: was IEMIMPL_SSE_PROLOGUE (prologue invoked twice, epilogue never)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1              ; two-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 rcpps, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4918
4919
4920;;
4921; Floating point instruction working on a full sized register and a single precision operand.
4922;
4923; @param 1 The instruction
4924;
4925; @param A0 FPU context (FXSTATE or XSAVEAREA).
4926; @param A1 Where to return the result including the MXCSR value.
4927; @param A2 Pointer to the first media register size operand (input/output).
4928; @param A3 Pointer to the second single precision floating point value (input).
4929;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]              ; single-precision scalar source
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (prologue invoked twice, epilogue never)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
4972
4973
4974;;
4975; Floating point instruction working on a full sized register and a double precision operand.
4976;
4977; @param 1 The instruction
4978;
4979; @param A0 FPU context (FXSTATE or XSAVEAREA).
4980; @param A1 Where to return the result including the MXCSR value.
4981; @param A2 Pointer to the first media register size operand (input/output).
4982; @param A3 Pointer to the second double precision floating point value (input).
4983;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; load guest rounding/denormal controls, mask exceptions

        movdqu  xmm0, [A2]
        movq    xmm1, [A3]              ; double-precision scalar source
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; return merged MXCSR and restore the host value
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovq   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
5024
5025
5026;;
5027; Macro for the cvtpd2ps/cvtps2pd instructions.
5028;
5029; 1 The instruction name.
5030; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
5031;
5032; @param A0 FPU context (FXSTATE or XSAVEAREA).
5033; @param A1 Where to return the result including the MXCSR value.
5034; @param A2 Pointer to the first media register size operand (input/output).
5035; @param A3 Pointer to the second media register size operand (input).
5036;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; load guest rounding/denormal controls, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; return merged MXCSR and restore the host value
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; narrowing conversion: 256-bit source, 128-bit result
 %else
        v %+ %1 ymm0, xmm1              ; widening conversion: 128-bit source, 256-bit result
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
5090
5091
5092;;
5093; shufps instructions with 8-bit immediates.
5094;
5095; @param A0 Pointer to the destination media register size operand (input/output).
5096; @param A1 Pointer to the first source media register size operand (input).
5097; @param A2 The 8-bit immediate
5098;
; Since the immediate must be encoded in the instruction, a table of 256
; stubs (one per immediate value) is generated below and the right one is
; called by computing its offset from the immediate in A2.
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub carrying this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps  xmm0, xmm1, bImm
        ret
        int3                            ; padding so every stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_shufps_u128
5130
5131
5132;;
5133; shufpd instruction with 8-bit immediates.
5134;
5135; @param A0 Pointer to the destination media register size operand (input/output).
5136; @param A1 Pointer to the first source media register size operand (input).
5137; @param A2 The 8-bit immediate
5138;
; Same 256-entry stub-table dispatch as iemAImpl_shufps_u128; no int3 padding
; needed here because shufpd is one byte longer than shufps.
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub carrying this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd  xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_shufpd_u128
5169
5170
5171;;
5172; vshufp{s,d} instructions with 8-bit immediates.
5173;
5174; @param 1 The instruction name.
5175;
5176; @param A0 Pointer to the destination media register size operand (output).
5177; @param A1 Pointer to the first source media register size operand (input).
5178; @param A2 Pointer to the second source media register size operand (input).
5179; @param A3 The 8-bit immediate
5180;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below (one stub per immediate)
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub carrying this immediate
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5247
5248
5249;;
5250; One of the [p]blendv{b,ps,pd} variants
5251;
5252; @param 1 The instruction
5253;
5254; @param A0 Pointer to the first media register sized operand (input/output).
5255; @param A1 Pointer to the second media sized value (input).
5256; @param A2 Pointer to the media register sized mask value (input).
5257;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2] ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1] ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; fix: was IEMIMPL_SSE_PROLOGUE (prologue invoked twice, epilogue never)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5277
5278
5279;;
5280; One of the v[p]blendv{b,ps,pd} variants
5281;
5282; @param 1 The instruction
5283;
5284; @param A0 Pointer to the first media register sized operand (output).
5285; @param A1 Pointer to the first media register sized operand (input).
5286; @param A2 Pointer to the second media register sized operand (input).
5287; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]              ; mask operand
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (prologue invoked twice, epilogue never)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]              ; mask operand
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5321
5322
5323;;
5324; palignr mm1, mm2/m64 instruction.
5325;
5326; @param A0 Pointer to the first media register sized operand (output).
5327; @param A1 The second register sized operand (input).
5328; @param A2 The 8-bit immediate.
; MMX palignr; the immediate must be encoded in the instruction, so the same
; 256-entry stub-table dispatch as for shufps/shufpd is used.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm0, [A0]
        movq    mm1, A1                 ; second operand is passed by value in A1
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub carrying this immediate
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_palignr_u64
5359
5360
5361;;
5362; SSE instructions with 8-bit immediates of the form
5363; xxx xmm1, xmm2, imm8.
5364; where the instruction encoding takes up 6 bytes.
5365;
5366; @param 1 The instruction name.
5367;
5368; @param A0 Pointer to the first media register size operand (input/output).
5369; @param A1 Pointer to the second source media register size operand (input).
5370; @param A2 The 8-bit immediate
5371;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below (one stub per immediate)
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub carrying this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; padding so every stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5413
5414
5415;;
5416; AVX instructions with 8-bit immediates of the form
5417; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5418; where the instruction encoding takes up 6 bytes.
5419;
5420; @param 1 The instruction name.
5421; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5422; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5423;
5424; @param A0 Pointer to the destination media register size operand (output).
5425; @param A1 Pointer to the first source media register size operand (input).
5426; @param A2 Pointer to the second source media register size operand (input).
5427; @param A3 The 8-bit immediate
5428;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below (one stub per immediate)
  %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
  %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
  %endif
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub carrying this immediate
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3                            ; padding so every stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below
  %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
  %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
  %endif
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3                            ; padding so every stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
5507
5508
5509;;
5510; AVX instructions with 8-bit immediates of the form
5511; xxx {x,y}mm1, {x,y}mm2, imm8.
5512; where the instruction encoding takes up 6 bytes.
5513;
5514; @param 1 The instruction name.
5515; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5516; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5517;
5518; @param A0 Pointer to the destination media register size operand (output).
5519; @param A1 Pointer to the first source media register size operand (input).
5520; @param A2 The 8-bit immediate
5521;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 3
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_4_ARGS ; NOTE(review): only A0..A2 are used; presumably the 4-arg prologue is harmless here - confirm
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below (one stub per immediate)
  %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
  %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
  %endif
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub carrying this immediate
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; padding so every stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below
  %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
  %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
  %endif
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
        int3                            ; padding so every stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 vpermilps, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 vpermilpd, 1, 1
5592
5593
5594;;
5595; Need to move this as well somewhere better?
5596;
; Source operands for the implicit-length pcmpistr{i,m} helpers.
struc IEMPCMPISTRXSRC
    .uSrc1        resd 4               ; First 128-bit source operand.
    .uSrc2        resd 4               ; Second 128-bit source operand.
endstruc

; Source operands for the explicit-length pcmpestr{i,m} helpers.
struc IEMPCMPESTRXSRC
    .uSrc1        resd 4               ; First 128-bit source operand.
    .uSrc2        resd 4               ; Second 128-bit source operand.
    .u64Rax       resd 2               ; Guest RAX (first operand length).
    .u64Rdx       resd 2               ; Guest RDX (second operand length).
endstruc
5608
5609;;
5610; The pcmpistri instruction.
5611;
5612; @param A0 Pointer to the ECX register to store the result to (output).
5613; @param A1 Pointer to the EFLAGS register.
5614; @param A2 Pointer to the structure containing the source operands (input).
5615; @param A3 The 8-bit immediate
5616;
; Uses the 256-entry stub-table dispatch to encode the immediate in the
; instruction; result index (ECX) and status flags are returned via pointers.
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub carrying this immediate

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the resulting status flags out for the guest
        mov     [T2], ecx               ; pcmpistri returns its index in ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistri xmm0, xmm1, bImm
        ret
        int3                            ; padding so every stub has the same size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpistri_u128
5651
5652;;
5653; The pcmpestri instruction.
5654;
5655; @param A0 Pointer to the ECX register to store the result to (output).
5656; @param A1 Pointer to the EFLAGS register.
5657; @param A2 Pointer to the structure containing the source operands (input).
5658; @param A3 The 8-bit immediate
5659;
; Explicit-length variant: additionally feeds the guest RAX/RDX lengths to the
; instruction; uses the same 256-entry stub-table dispatch for the immediate.
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table below
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1                      ; dispatch to the stub carrying this immediate

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the resulting status flags out for the guest
        mov     [T2], ecx               ; pcmpestri returns its index in ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestri_u128
5698
5699;;
5700; The pcmpistrm instruction template.
5701;
5702; @param A0 Pointer to the XMM0 register to store the result to (output).
5703; @param A1 Pointer to the EFLAGS register.
5704; @param A2 Pointer to the structure containing the source operands (input).
5705; @param A3 The 8-bit immediate
5706;
; Implementation note: the imm8 must be encoded into the instruction, so we
; dispatch into a table of 256 stubs, each pcmpistrm+ret+int3 (8 bytes), or
; endbrxx+pcmpistrm+ret+int3 (12 bytes) with IBT enabled.
;
5707BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
5708 PROLOGUE_4_ARGS
5709 IEMIMPL_SSE_PROLOGUE
5710
5711 movzx A3, A3_8 ; must clear top bits
5712 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
5713 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
5714 lea T1, [.imm0 xWrtRIP] ; T1 = base of the 256-entry stub table.
5715 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5716 lea T0, [A3 + A3*2] ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
5717 lea T1, [T1 + T0*4]
5718 %else
 lea T1, [T1 + A3*8] ; sizeof(pcmpistrm+ret) == 8: A3 * 8 (was "lea T0", which discarded the stub address and made the call below always execute the imm8=0 stub; now matches the pcmpestr{i,m} siblings)
5720 %endif
5721 IBT_NOTRACK
5722 call T1 ; Execute the stub matching the immediate; mask result lands in xmm0.
5723
5724 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5725 movdqu [A0], xmm0 ; Store the mask result (virtual XMM0).
5726
5727 IEMIMPL_SSE_EPILOGUE
5728 EPILOGUE_4_ARGS
5729 %assign bImm 0
5730 %rep 256
5731.imm %+ bImm:
5732 IBT_ENDBRxx_WITHOUT_NOTRACK
5733 pcmpistrm xmm1, xmm2, bImm
5734 ret
5735 int3 ; Padding so every stub is exactly 8 bytes.
5736 %assign bImm bImm + 1
5737 %endrep
5738.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5739ENDPROC iemAImpl_pcmpistrm_u128
5740
5741;;
5742; The pcmpestrm instruction template.
5743;
5744; @param A0 Pointer to the XMM0 register to store the result to (output).
5745; @param A1 Pointer to the EFLAGS register.
5746; @param A2 Pointer to the structure containing the source operands (input).
5747; @param A3 The 8-bit immediate
5748;
; Implementation note: the imm8 must be encoded into the instruction, so we
; dispatch into a table of 256 stubs, each REX.W+pcmpestrm+ret (8 bytes), or
; endbrxx+that (12 bytes) with IBT enabled.
;
5749BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
5750 PROLOGUE_4_ARGS
5751 IEMIMPL_SSE_PROLOGUE
5752
5753 movzx A3, A3_8 ; must clear top bits
5754 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
5755 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
5756 lea T1, [.imm0 xWrtRIP] ; T1 = base of the 256-entry stub table.
5757 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5758 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5759 lea T1, [T1 + T0*4]
5760 %else
5761 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5762 %endif
5763 push xDX ; xDX can be A1 or A2 depending on the calling convention
5764 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5765 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5766 IBT_NOTRACK
5767 call T1 ; Execute the stub matching the immediate; mask result lands in xmm0.
5768
5769 pop xDX
5770 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5771 movdqu [A0], xmm0 ; Store the mask result (virtual XMM0).
5772
5773 IEMIMPL_SSE_EPILOGUE
5774 EPILOGUE_4_ARGS
5775 %assign bImm 0
5776 %rep 256
5777.imm %+ bImm:
5778 IBT_ENDBRxx_WITHOUT_NOTRACK
5779 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5780 pcmpestrm xmm1, xmm2, bImm
5781 ret
5782 %assign bImm bImm + 1
5783 %endrep
5784.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5785ENDPROC iemAImpl_pcmpestrm_u128
5786
5787
5788;;
5789; pinsrw instruction.
5790;
5791; @param A0 Pointer to the first media register size operand (input/output).
5792; @param A1 The 16 bit input operand (input).
5793; @param A2 The 8-bit immediate
5794;
; Implementation note: the word-index imm8 must be encoded into pinsrw, so we
; dispatch into a table of 256 stubs of pinsrw+ret (5 bytes each; 9 with IBT).
;
5795BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
5796 PROLOGUE_3_ARGS
5797 IEMIMPL_SSE_PROLOGUE
5798
5799 movzx A2, A2_8 ; must clear top bits
5800 movq mm0, [A0] ; Load the current MMX register value.
5801 lea T1, [.imm0 xWrtRIP]
5802 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5803 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
5804 %else
5805 lea T0, [A2 + A2*4] ; sizeof(pinsrw+ret) == 5: A2 * 5
5806 %endif
5807 lea T1, [T1 + T0]
5808 IBT_NOTRACK
5809 call T1
5810 movq [A0], mm0 ; Write back the modified register value.
5811
5812 IEMIMPL_SSE_EPILOGUE
5813 EPILOGUE_3_ARGS
5814 %assign bImm 0
5815 %rep 256
5816.imm %+ bImm:
5817 IBT_ENDBRxx_WITHOUT_NOTRACK
5818 pinsrw mm0, A1_32, bImm
5819 ret
5820 %assign bImm bImm + 1
5821 %endrep
5822.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5823ENDPROC iemAImpl_pinsrw_u64
5824
;;
; pinsrw instruction, 128-bit (XMM) variant.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 The 16 bit input operand (input).
; @param A2 The 8-bit immediate (word index).
;
; Implementation note: dispatches into a 256-entry stub table of pinsrw+ret
; (6 bytes each; 10 with IBT), since the imm8 must be encoded into pinsrw.
;
5825BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
5826 PROLOGUE_3_ARGS
5827 IEMIMPL_SSE_PROLOGUE
5828
5829 movzx A2, A2_8 ; must clear top bits
5830 movdqu xmm0, [A0] ; Load the current XMM register value.
5831 lea T1, [.imm0 xWrtRIP]
5832 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5833 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5834 %else
5835 lea T0, [A2 + A2*2] ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5836 %endif
5837 lea T1, [T1 + T0*2]
5838 IBT_NOTRACK
5839 call T1
5840 movdqu [A0], xmm0 ; Write back the modified register value.
5841
5842 IEMIMPL_SSE_EPILOGUE
5843 EPILOGUE_3_ARGS
5844 %assign bImm 0
5845 %rep 256
5846.imm %+ bImm:
5847 IBT_ENDBRxx_WITHOUT_NOTRACK
5848 pinsrw xmm0, A1_32, bImm
5849 ret
5850 %assign bImm bImm + 1
5851 %endrep
5852.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5853ENDPROC iemAImpl_pinsrw_u128
5854
5855;;
5856; vpinsrw instruction.
5857;
5858; @param A0 Pointer to the first media register size operand (output).
5859; @param A1 Pointer to the source media register size operand (input).
5860; @param A2 The 16 bit input operand (input).
5861; @param A3 The 8-bit immediate
5862;
; Implementation note: dispatches into a 256-entry stub table of vpinsrw+ret
; (6 bytes each; 10 with IBT), since the imm8 must be encoded into vpinsrw.
;
5863BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
5864 PROLOGUE_4_ARGS
5865 IEMIMPL_SSE_PROLOGUE
5866
5867 movzx A3, A3_8 ; must clear top bits
5868 movdqu xmm0, [A1] ; Load the source register value (A1 pointer is dead after this).
5869 lea T1, [.imm0 xWrtRIP]
5870 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5871 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
5872 %else
5873 lea T0, [A3 + A3*2] ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
5874 %endif
5875 lea T1, [T1 + T0*2]
5876 mov A1, A2 ; A2 requires longer encoding on Windows
5877 IBT_NOTRACK
5878 call T1
5879 movdqu [A0], xmm0 ; Store the result to the destination register.
5880
5881 IEMIMPL_SSE_EPILOGUE
5882 EPILOGUE_4_ARGS
5883 %assign bImm 0
5884 %rep 256
5885.imm %+ bImm:
5886 IBT_ENDBRxx_WITHOUT_NOTRACK
5887 vpinsrw xmm0, xmm0, A1_32, bImm
5888 ret
5889 %assign bImm bImm + 1
5890 %endrep
5891.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5892ENDPROC iemAImpl_vpinsrw_u128
5893
5894
5895;;
5896; pextrw instruction.
5897;
5898; @param A0 Pointer to the 16bit output operand (output).
; @param A1 The 64-bit MMX source value (input) - passed by value, not by
; pointer: the code below moves A1 straight into mm0.
5900; @param A2 The 8-bit immediate
5901;
; Implementation note: dispatches into a 256-entry stub table of pextrw+ret
; (5 bytes each; 9 with IBT), since the imm8 must be encoded into pextrw.
;
5902BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
5903 PROLOGUE_3_ARGS
5904 IEMIMPL_SSE_PROLOGUE
5905
5906 movzx A2, A2_8 ; must clear top bits
5907 movq mm0, A1 ; A1 is the source value itself (GPR -> MMX move).
5908 lea T1, [.imm0 xWrtRIP]
5909 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5910 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
5911 %else
5912 lea T0, [A2 + A2*4] ; sizeof(pextrw+ret) == 5: A2 * 5
5913 %endif
5914 lea T1, [T1 + T0]
5915 IBT_NOTRACK
5916 call T1
5917 mov word [A0], T0_16 ; Store the word the stub extracted into T0.
5918
5919 IEMIMPL_SSE_EPILOGUE
5920 EPILOGUE_3_ARGS
5921 %assign bImm 0
5922 %rep 256
5923.imm %+ bImm:
5924 IBT_ENDBRxx_WITHOUT_NOTRACK
5925 pextrw T0_32, mm0, bImm
5926 ret
5927 %assign bImm bImm + 1
5928 %endrep
5929.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5930ENDPROC iemAImpl_pextrw_u64
5931
;;
; pextrw instruction, 128-bit (XMM) variant.
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 8-bit immediate (word index).
;
; Implementation note: dispatches into a 256-entry stub table of pextrw+ret
; (6 bytes each; 10 with IBT), since the imm8 must be encoded into pextrw.
;
5932BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
5933 PROLOGUE_3_ARGS
5934 IEMIMPL_SSE_PROLOGUE
5935
5936 movzx A2, A2_8 ; must clear top bits
5937 movdqu xmm0, [A1]
5938 lea T1, [.imm0 xWrtRIP]
5939 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5940 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5941 %else
5942 lea T0, [A2 + A2*2] ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5943 %endif
5944 lea T1, [T1 + T0*2]
5945 IBT_NOTRACK
5946 call T1
5947 mov word [A0], T0_16 ; Store the word the stub extracted into T0.
5948
5949 IEMIMPL_SSE_EPILOGUE
5950 EPILOGUE_3_ARGS
5951 %assign bImm 0
5952 %rep 256
5953.imm %+ bImm:
5954 IBT_ENDBRxx_WITHOUT_NOTRACK
5955 pextrw T0_32, xmm0, bImm
5956 ret
5957 %assign bImm bImm + 1
5958 %endrep
5959.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5960ENDPROC iemAImpl_pextrw_u128
5961
5962;;
5963; vpextrw instruction.
5964;
5965; @param A0 Pointer to the 16bit output operand (output).
5966; @param A1 Pointer to the source media register size operand (input).
5967; @param A2 The 8-bit immediate
5968;
; Implementation note: dispatches into a 256-entry stub table of vpextrw+ret
; (6 bytes each; 10 with IBT), since the imm8 must be encoded into vpextrw.
;
5969BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
5970 PROLOGUE_3_ARGS
5971 IEMIMPL_SSE_PROLOGUE
5972
5973 movzx A2, A2_8 ; must clear top bits
5974 movdqu xmm0, [A1]
5975 lea T1, [.imm0 xWrtRIP]
5976 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5977 lea T0, [A2 + A2*4] ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5978 %else
5979 lea T0, [A2 + A2*2] ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5980 %endif
5981 lea T1, [T1 + T0*2]
5982 IBT_NOTRACK
5983 call T1
5984 mov word [A0], T0_16 ; Store the word the stub extracted into T0.
5985
5986 IEMIMPL_SSE_EPILOGUE
5987 EPILOGUE_3_ARGS
5988 %assign bImm 0
5989 %rep 256
5990.imm %+ bImm:
5991 IBT_ENDBRxx_WITHOUT_NOTRACK
5992 vpextrw T0_32, xmm0, bImm
5993 ret
5994 %assign bImm bImm + 1
5995 %endrep
5996.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5997ENDPROC iemAImpl_vpextrw_u128
5998
5999
6000;;
6001; movmskp{s,d} SSE instruction template
6002;
; Emits three worker functions: the SSE 128-bit form, the AVX 128-bit form,
; and the AVX 256-bit form, each extracting the sign-bit mask of the source.
;
6003; @param 1 The SSE instruction name.
6004; @param 2 The AVX instruction name.
6005;
6006; @param A0 Pointer to the output register (output/byte sized).
6007; @param A1 Pointer to the source media register size operand (input).
6008;
6009%macro IEMIMPL_MEDIA_MOVMSK_P 2
6010BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6011 PROLOGUE_2_ARGS
6012 IEMIMPL_SSE_PROLOGUE
6013
6014 movdqu xmm0, [A1]
6015 %1 T0, xmm0 ; e.g. movmskps T0, xmm0 - sign bits into the low bits of T0.
6016 mov byte [A0], T0_8
6017
6018 IEMIMPL_SSE_EPILOGUE
6019 EPILOGUE_2_ARGS
6020ENDPROC iemAImpl_ %+ %1 %+ _u128
6021
6022BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
6023 PROLOGUE_2_ARGS
6024 IEMIMPL_AVX_PROLOGUE
6025
6026 movdqu xmm0, [A1]
6027 %2 T0, xmm0 ; AVX 128-bit variant.
6028 mov byte [A0], T0_8
6029
6030 IEMIMPL_AVX_EPILOGUE
6031 EPILOGUE_2_ARGS
6032ENDPROC iemAImpl_ %+ %2 %+ _u128
6033
6034BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
6035 PROLOGUE_2_ARGS
6036 IEMIMPL_AVX_PROLOGUE
6037
6038 vmovdqu ymm0, [A1]
6039 %2 T0, ymm0 ; AVX 256-bit variant.
6040 mov byte [A0], T0_8
6041
6042 IEMIMPL_AVX_EPILOGUE
6043 EPILOGUE_2_ARGS
6044ENDPROC iemAImpl_ %+ %2 %+ _u256
6045%endmacro
6046
6047IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
6048IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6049
6050
6051;;
6052; Restores the SSE MXCSR register with the original value.
6053;
; Reads the current (host) MXCSR, merges its exception-status flags into the
; guest MXCSR value taken from the FXSTATE, stores the merged value at %1,
; and then reloads the host MXCSR saved on the stack by the matching
; SSE_LD_FXSTATE_MXCSR invocation (hence the trailing add xSP, 4).
;
6054; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
6055; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6056; @param 2 Expression giving the address of the FXSTATE of the guest.
6057;
6058; @note Restores the stack pointer.
6059;
6060%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
6061 sub xSP, 4
6062 stmxcsr [xSP] ; Capture the MXCSR state after the emulated instruction ran.
6063 mov T0_32, [xSP]
6064 add xSP, 4
6065 ; Merge the status bits into the original MXCSR value.
6066 mov T1_32, [%2 + X86FXSTATE.MXCSR]
6067 and T0_32, X86_MXCSR_XCPT_FLAGS
6068 or T0_32, T1_32
6069 mov [%1], T0_32
6070
6071 ldmxcsr [xSP] ; Restore the host MXCSR saved by the LD counterpart.
6072 add xSP, 4
6073%endmacro
6074
6075
6076;;
6077; cvttsd2si instruction - 32-bit variant.
6078;
; Converts a double to a 32-bit signed integer with truncation, under the
; guest's MXCSR settings.
;
6079; @param A0 FPU context (FXSTATE or XSAVEAREA).
6080; @param A1 Where to return the MXCSR value.
6081; @param A2 Pointer to the result operand (output).
6082; @param A3 Pointer to the second operand (input).
6083;
6084BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
6085 PROLOGUE_4_ARGS
6086 IEMIMPL_SSE_PROLOGUE
6087 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6088
6089 cvttsd2si T0_32, [A3]
6090 mov dword [A2], T0_32
6091
6092 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6093 IEMIMPL_SSE_EPILOGUE
6094 EPILOGUE_4_ARGS
6095ENDPROC iemAImpl_cvttsd2si_i32_r64
6096
6097;;
6098; cvttsd2si instruction - 64-bit variant.
6099;
; Converts a double to a 64-bit signed integer with truncation, under the
; guest's MXCSR settings.
;
6100; @param A0 FPU context (FXSTATE or XSAVEAREA).
6101; @param A1 Where to return the MXCSR value.
6102; @param A2 Pointer to the result operand (output).
6103; @param A3 Pointer to the second operand (input).
6104;
6105BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
6106 PROLOGUE_4_ARGS
6107 IEMIMPL_SSE_PROLOGUE
6108 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6109
6110 cvttsd2si T0, [A3]
6111 mov qword [A2], T0
6112
6113 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6114 IEMIMPL_SSE_EPILOGUE
6115 EPILOGUE_4_ARGS
6116ENDPROC iemAImpl_cvttsd2si_i64_r64
6117
6118
6119;;
6120; cvtsd2si instruction - 32-bit variant.
6121;
; Converts a double to a 32-bit signed integer using the rounding mode from
; the guest's MXCSR (unlike cvttsd2si, which always truncates).
;
6122; @param A0 FPU context (FXSTATE or XSAVEAREA).
6123; @param A1 Where to return the MXCSR value.
6124; @param A2 Pointer to the result operand (output).
6125; @param A3 Pointer to the second operand (input).
6126;
6127BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
6128 PROLOGUE_4_ARGS
6129 IEMIMPL_SSE_PROLOGUE
6130 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6131
6132 cvtsd2si T0_32, [A3]
6133 mov dword [A2], T0_32
6134
6135 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6136 IEMIMPL_SSE_EPILOGUE
6137 EPILOGUE_4_ARGS
6138ENDPROC iemAImpl_cvtsd2si_i32_r64
6139
6140;;
6141; cvtsd2si instruction - 64-bit variant.
6142;
; Converts a double to a 64-bit signed integer using the rounding mode from
; the guest's MXCSR.
;
6143; @param A0 FPU context (FXSTATE or XSAVEAREA).
6144; @param A1 Where to return the MXCSR value.
6145; @param A2 Pointer to the result operand (output).
6146; @param A3 Pointer to the second operand (input).
6147;
6148BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
6149 PROLOGUE_4_ARGS
6150 IEMIMPL_SSE_PROLOGUE
6151 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6152
6153 cvtsd2si T0, [A3]
6154 mov qword [A2], T0
6155
6156 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6157 IEMIMPL_SSE_EPILOGUE
6158 EPILOGUE_4_ARGS
6159ENDPROC iemAImpl_cvtsd2si_i64_r64
6160
6161
6162;;
6163; cvttss2si instruction - 32-bit variant.
6164;
; Converts a single-precision float to a 32-bit signed integer with
; truncation, under the guest's MXCSR settings.
;
6165; @param A0 FPU context (FXSTATE or XSAVEAREA).
6166; @param A1 Where to return the MXCSR value.
6167; @param A2 Pointer to the result operand (output).
6168; @param A3 Pointer to the second operand (input).
6169;
6170BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
6171 PROLOGUE_4_ARGS
6172 IEMIMPL_SSE_PROLOGUE
6173 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6174
6175 cvttss2si T0_32, [A3]
6176 mov dword [A2], T0_32
6177
6178 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6179 IEMIMPL_SSE_EPILOGUE
6180 EPILOGUE_4_ARGS
6181ENDPROC iemAImpl_cvttss2si_i32_r32
6182
6183;;
6184; cvttss2si instruction - 64-bit variant.
6185;
; Converts a single-precision float to a 64-bit signed integer with
; truncation, under the guest's MXCSR settings.
;
6186; @param A0 FPU context (FXSTATE or XSAVEAREA).
6187; @param A1 Where to return the MXCSR value.
6188; @param A2 Pointer to the result operand (output).
6189; @param A3 Pointer to the second operand (input).
6190;
6191BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
6192 PROLOGUE_4_ARGS
6193 IEMIMPL_SSE_PROLOGUE
6194 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6195
6196 cvttss2si T0, [A3]
6197 mov qword [A2], T0
6198
6199 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6200 IEMIMPL_SSE_EPILOGUE
6201 EPILOGUE_4_ARGS
6202ENDPROC iemAImpl_cvttss2si_i64_r32
6203
6204
6205;;
6206; cvtss2si instruction - 32-bit variant.
6207;
; Converts a single-precision float to a 32-bit signed integer using the
; rounding mode from the guest's MXCSR.
;
6208; @param A0 FPU context (FXSTATE or XSAVEAREA).
6209; @param A1 Where to return the MXCSR value.
6210; @param A2 Pointer to the result operand (output).
6211; @param A3 Pointer to the second operand (input).
6212;
6213BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
6214 PROLOGUE_4_ARGS
6215 IEMIMPL_SSE_PROLOGUE
6216 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6217
6218 cvtss2si T0_32, [A3]
6219 mov dword [A2], T0_32
6220
6221 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6222 IEMIMPL_SSE_EPILOGUE
6223 EPILOGUE_4_ARGS
6224ENDPROC iemAImpl_cvtss2si_i32_r32
6225
6226;;
6227; cvtss2si instruction - 64-bit variant.
6228;
; Converts a single-precision float to a 64-bit signed integer using the
; rounding mode from the guest's MXCSR.
;
6229; @param A0 FPU context (FXSTATE or XSAVEAREA).
6230; @param A1 Where to return the MXCSR value.
6231; @param A2 Pointer to the result operand (output).
6232; @param A3 Pointer to the second operand (input).
6233;
6234BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
6235 PROLOGUE_4_ARGS
6236 IEMIMPL_SSE_PROLOGUE
6237 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6238
6239 cvtss2si T0, [A3]
6240 mov qword [A2], T0
6241
6242 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6243 IEMIMPL_SSE_EPILOGUE
6244 EPILOGUE_4_ARGS
6245ENDPROC iemAImpl_cvtss2si_i64_r32
6246
6247
6248;;
6249; cvtsi2ss instruction - 32-bit variant.
6250;
; Converts a 32-bit signed integer to a single-precision float under the
; guest's MXCSR rounding mode.
;
6251; @param A0 FPU context (FXSTATE or XSAVEAREA).
6252; @param A1 Where to return the MXCSR value.
6253; @param A2 Pointer to the result operand (output).
6254; @param A3 Pointer to the second operand (input).
6255;
6256BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
6257 PROLOGUE_4_ARGS
6258 IEMIMPL_SSE_PROLOGUE
6259 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6260
6261 cvtsi2ss xmm0, dword [A3]
6262 movd dword [A2], xmm0 ; Store the 32-bit float result.
6263
6264 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6265 IEMIMPL_SSE_EPILOGUE
6266 EPILOGUE_4_ARGS
6267ENDPROC iemAImpl_cvtsi2ss_r32_i32
6268
6269;;
6270; cvtsi2ss instruction - 64-bit variant.
6271;
; Converts a 64-bit signed integer to a single-precision float under the
; guest's MXCSR rounding mode.
;
6272; @param A0 FPU context (FXSTATE or XSAVEAREA).
6273; @param A1 Where to return the MXCSR value.
6274; @param A2 Pointer to the result operand (output).
6275; @param A3 Pointer to the second operand (input).
6276;
6277BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
6278 PROLOGUE_4_ARGS
6279 IEMIMPL_SSE_PROLOGUE
6280 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6281
6282 cvtsi2ss xmm0, qword [A3]
6283 movd dword [A2], xmm0 ; Store the 32-bit float result.
6284
6285 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6286 IEMIMPL_SSE_EPILOGUE
6287 EPILOGUE_4_ARGS
6288ENDPROC iemAImpl_cvtsi2ss_r32_i64
6289
6290
6291;;
6292; cvtsi2sd instruction - 32-bit variant.
6293;
; Converts a 32-bit signed integer to a double under the guest's MXCSR
; settings.
;
6294; @param A0 FPU context (FXSTATE or XSAVEAREA).
6295; @param A1 Where to return the MXCSR value.
6296; @param A2 Pointer to the result operand (output).
6297; @param A3 Pointer to the second operand (input).
6298;
6299BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
6300 PROLOGUE_4_ARGS
6301 IEMIMPL_SSE_PROLOGUE
6302 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6303
6304 cvtsi2sd xmm0, dword [A3]
6305 movq [A2], xmm0 ; Store the 64-bit double result.
6306
6307 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6308 IEMIMPL_SSE_EPILOGUE
6309 EPILOGUE_4_ARGS
6310ENDPROC iemAImpl_cvtsi2sd_r64_i32
6311
6312;;
6313; cvtsi2sd instruction - 64-bit variant.
6314;
; Converts a 64-bit signed integer to a double under the guest's MXCSR
; rounding mode.
;
6315; @param A0 FPU context (FXSTATE or XSAVEAREA).
6316; @param A1 Where to return the MXCSR value.
6317; @param A2 Pointer to the result operand (output).
6318; @param A3 Pointer to the second operand (input).
6319;
6320BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
6321 PROLOGUE_4_ARGS
6322 IEMIMPL_SSE_PROLOGUE
6323 SSE_LD_FXSTATE_MXCSR A0 ; Activate the guest MXCSR for the conversion.
6324
6325 cvtsi2sd xmm0, qword [A3]
6326 movq [A2], xmm0 ; Store the 64-bit double result.
6327
6328 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
6329 IEMIMPL_SSE_EPILOGUE
6330 EPILOGUE_4_ARGS
6331ENDPROC iemAImpl_cvtsi2sd_r64_i64
6332
6333
6334;;
6335; Initialize the SSE MXCSR register using the guest value partially to
6336; account for rounding mode.
6337;
; Saves the host MXCSR on the stack (left there for the matching
; SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE to restore - hence the unbalanced
; sub/add of xSP below), then loads an MXCSR combining the guest's FZ/DAZ
; and rounding-control bits with all exceptions masked.
;
6338; @uses 4 bytes of stack to save the original value, T0.
6339; @param 1 Expression giving the address of the MXCSR register of the guest.
6340;
6341%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
6342 sub xSP, 4
6343
6344 stmxcsr [xSP] ; Save host MXCSR; stays on the stack until the ST macro runs.
6345 mov T0_32, [%1]
6346 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; Keep only guest FZ/RC/DAZ.
6347 or T0_32, X86_MXCSR_XCPT_MASK ; Mask all exceptions so the host never faults.
6348 sub xSP, 4
6349 mov [xSP], T0_32
6350 ldmxcsr [xSP]
6351 add xSP, 4
6352%endmacro
6353
6354
6355;;
6356; Restores the SSE MXCSR register with the original value.
6357;
; Counterpart to SSE_LD_FXSTATE_MXCSR_ONLY: merges the exception-status flags
; raised during emulation into the guest MXCSR value at %1, then reloads the
; host MXCSR that the LD macro left on the stack (restoring xSP in the
; process).
;
6358; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
6359; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6360;
6361; @note Restores the stack pointer.
6362;
6363%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
6364 sub xSP, 4
6365 stmxcsr [xSP] ; Capture the MXCSR state after the emulated instruction ran.
6366 mov T0_32, [xSP]
6367 add xSP, 4
6368 ; Merge the status bits into the original MXCSR value.
6369 mov T1_32, [%1]
6370 and T0_32, X86_MXCSR_XCPT_FLAGS
6371 or T0_32, T1_32
6372 mov [%1], T0_32
6373
6374 ldmxcsr [xSP] ; Restore the host MXCSR saved by the LD counterpart.
6375 add xSP, 4
6376%endmacro
6377
6378
6379;
6380; UCOMISS (SSE)
6381;
; Unordered scalar single-precision compare; the EFLAGS result is captured
; via IEM_SAVE_FLAGS and MXCSR exception flags via the ST macro.
;
6382; @param A0 Pointer to the MXCSR value (input/output).
6383; @param A1 Pointer to the EFLAGS value (input/output).
6384; @param A2 Pointer to the first source operand (aka readonly destination).
6385; @param A3 Pointer to the second source operand.
6386;
6387BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
6388 PROLOGUE_4_ARGS
6389 IEMIMPL_SSE_PROLOGUE
6390 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6391
6392 movdqu xmm0, [A2]
6393 movdqu xmm1, [A3]
6394 ucomiss xmm0, xmm1
6395 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Capture ZF/PF/CF etc. into the guest EFLAGS.
6396
6397 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6398 IEMIMPL_SSE_EPILOGUE
6399 EPILOGUE_4_ARGS
6400ENDPROC iemAImpl_ucomiss_u128
6401
;;
; VUCOMISS (AVX) - same parameters and behavior as iemAImpl_ucomiss_u128,
; using the VEX-encoded instruction.
;
6402BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
6403 PROLOGUE_4_ARGS
6404 IEMIMPL_SSE_PROLOGUE
6405 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6406
6407 movdqu xmm0, [A2]
6408 movdqu xmm1, [A3]
6409 vucomiss xmm0, xmm1
6410 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Capture the compare result flags.
6411
6412 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6413 IEMIMPL_SSE_EPILOGUE
6414 EPILOGUE_4_ARGS
6415ENDPROC iemAImpl_vucomiss_u128
6416
6417
6418;
6419; UCOMISD (SSE)
6420;
; Unordered scalar double-precision compare; EFLAGS result captured via
; IEM_SAVE_FLAGS, MXCSR exception flags via the ST macro.
;
6421; @param A0 Pointer to the MXCSR value (input/output).
6422; @param A1 Pointer to the EFLAGS value (input/output).
6423; @param A2 Pointer to the first source operand (aka readonly destination).
6424; @param A3 Pointer to the second source operand.
6425;
6426BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
6427 PROLOGUE_4_ARGS
6428 IEMIMPL_SSE_PROLOGUE
6429 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6430
6431 movdqu xmm0, [A2]
6432 movdqu xmm1, [A3]
6433 ucomisd xmm0, xmm1
6434 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Capture the compare result flags.
6435
6436 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6437 IEMIMPL_SSE_EPILOGUE
6438 EPILOGUE_4_ARGS
6439ENDPROC iemAImpl_ucomisd_u128
6440
;;
; VUCOMISD (AVX) - same parameters and behavior as iemAImpl_ucomisd_u128,
; using the VEX-encoded instruction.
;
6441BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
6442 PROLOGUE_4_ARGS
6443 IEMIMPL_SSE_PROLOGUE
6444 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6445
6446 movdqu xmm0, [A2]
6447 movdqu xmm1, [A3]
6448 vucomisd xmm0, xmm1
6449 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Capture the compare result flags.
6450
6451 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6452 IEMIMPL_SSE_EPILOGUE
6453 EPILOGUE_4_ARGS
6454ENDPROC iemAImpl_vucomisd_u128
6455
6456;
6457; COMISS (SSE)
6458;
; Ordered scalar single-precision compare; EFLAGS result captured via
; IEM_SAVE_FLAGS, MXCSR exception flags via the ST macro.
;
6459; @param A0 Pointer to the MXCSR value (input/output).
6460; @param A1 Pointer to the EFLAGS value (input/output).
6461; @param A2 Pointer to the first source operand (aka readonly destination).
6462; @param A3 Pointer to the second source operand.
6463;
6464BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
6465 PROLOGUE_4_ARGS
6466 IEMIMPL_SSE_PROLOGUE
6467 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6468
6469 movdqu xmm0, [A2]
6470 movdqu xmm1, [A3]
6471 comiss xmm0, xmm1
6472 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Capture the compare result flags.
6473
6474 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6475 IEMIMPL_SSE_EPILOGUE
6476 EPILOGUE_4_ARGS
6477ENDPROC iemAImpl_comiss_u128
6478
;;
; VCOMISS (AVX) - same parameters and behavior as iemAImpl_comiss_u128,
; using the VEX-encoded instruction.
;
6479BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
6480 PROLOGUE_4_ARGS
6481 IEMIMPL_SSE_PROLOGUE
6482 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6483
6484 movdqu xmm0, [A2]
6485 movdqu xmm1, [A3]
6486 vcomiss xmm0, xmm1
6487 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Capture the compare result flags.
6488
6489 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6490 IEMIMPL_SSE_EPILOGUE
6491 EPILOGUE_4_ARGS
6492ENDPROC iemAImpl_vcomiss_u128
6493
6494
6495;
6496; COMISD (SSE)
6497;
; Ordered scalar double-precision compare; EFLAGS result captured via
; IEM_SAVE_FLAGS, MXCSR exception flags via the ST macro.
;
6498; @param A0 Pointer to the MXCSR value (input/output).
6499; @param A1 Pointer to the EFLAGS value (input/output).
6500; @param A2 Pointer to the first source operand (aka readonly destination).
6501; @param A3 Pointer to the second source operand.
6502;
6503BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
6504 PROLOGUE_4_ARGS
6505 IEMIMPL_SSE_PROLOGUE
6506 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6507
6508 movdqu xmm0, [A2]
6509 movdqu xmm1, [A3]
6510 comisd xmm0, xmm1
6511 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Capture the compare result flags.
6512
6513 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6514 IEMIMPL_SSE_EPILOGUE
6515 EPILOGUE_4_ARGS
6516ENDPROC iemAImpl_comisd_u128
6517
;;
; VCOMISD (AVX) - same parameters and behavior as iemAImpl_comisd_u128,
; using the VEX-encoded instruction.
;
6518BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
6519 PROLOGUE_4_ARGS
6520 IEMIMPL_SSE_PROLOGUE
6521 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6522
6523 movdqu xmm0, [A2]
6524 movdqu xmm1, [A3]
6525 vcomisd xmm0, xmm1
6526 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Capture the compare result flags.
6527
6528 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6529 IEMIMPL_SSE_EPILOGUE
6530 EPILOGUE_4_ARGS
6531ENDPROC iemAImpl_vcomisd_u128
6532
6533
6534;;
6535; Need to move this as well somewhere better?
;
; Layout of the two-XMM-source input structure used by the cmpXX/roundXX/dpXX
; helpers below: two consecutive 128-bit source operands.
6536;
6537struc IEMMEDIAF2XMMSRC
6538 .uSrc1 resd 4 ; First 128-bit source operand.
6539 .uSrc2 resd 4 ; Second 128-bit source operand.
6540endstruc
6541
6542
6543;
6544; CMPPS (SSE)
6545;
; Dispatches into a 256-entry stub table of cmpps+ret (5 bytes each; 9 with
; IBT), since the predicate imm8 must be encoded into the instruction.
;
6546; @param A0 Pointer to the MXCSR value (input/output).
6547; @param A1 Pointer to the first media register size operand (output).
6548; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6549; @param A3 The 8-bit immediate (input).
6550;
6551BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
6552 PROLOGUE_4_ARGS
6553 IEMIMPL_SSE_PROLOGUE
6554 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6555
6556 movzx A3, A3_8 ; must clear top bits
6557 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6558 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6559 lea T1, [.imm0 xWrtRIP] ; T1 = base of the 256-entry stub table.
6560 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6561 lea T0, [A3 + A3*8] ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
6562 %else
6563 lea T0, [A3 + A3*4] ; sizeof(cmpps+ret) == 5: A3 * 5
6564 %endif
6565 lea T1, [T1 + T0]
6566 IBT_NOTRACK
6567 call T1 ; Execute the stub matching the immediate.
6568 movdqu [A1], xmm0 ; Store the comparison mask result.
6569
6570 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6571 IEMIMPL_SSE_EPILOGUE
6572 EPILOGUE_4_ARGS
6573 %assign bImm 0
6574 %rep 256
6575.imm %+ bImm:
6576 IBT_ENDBRxx_WITHOUT_NOTRACK
6577 cmpps xmm0, xmm1, bImm
6578 ret
6579 %assign bImm bImm + 1
6580 %endrep
6581.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
6582ENDPROC iemAImpl_cmpps_u128
6583
6584;;
6585; SSE instructions with 8-bit immediates of the form
6586; xxx xmm1, xmm2, imm8.
6587; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
6588; register.
6589;
; Each worker dispatches into a 256-entry stub table (insn+ret, 6 bytes each;
; 10 with IBT) because the imm8 must be encoded into the instruction.
;
6590; @param 1 The instruction name.
6591;
6592; @param A0 Pointer to the MXCSR value (input/output).
6593; @param A1 Pointer to the first media register size operand (output).
6594; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6595; @param A3 The 8-bit immediate (input).
6596;
6597%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
6598BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6599 PROLOGUE_4_ARGS
6600 IEMIMPL_SSE_PROLOGUE
6601 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6602
6603 movzx A3, A3_8 ; must clear top bits
6604 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6605 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6606 lea T1, [.imm0 xWrtRIP] ; T1 = base of the 256-entry stub table.
6607 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6608 lea T0, [A3 + A3*4] ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
6609 %else
6610 lea T0, [A3 + A3*2] ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
6611 %endif
6612 lea T1, [T1 + T0*2]
6613 IBT_NOTRACK
6614 call T1 ; Execute the stub matching the immediate.
6615 movdqu [A1], xmm0 ; Store the result.
6616
6617 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6618 IEMIMPL_SSE_EPILOGUE
6619 EPILOGUE_4_ARGS
6620 %assign bImm 0
6621 %rep 256
6622.imm %+ bImm:
6623 IBT_ENDBRxx_WITHOUT_NOTRACK
6624 %1 xmm0, xmm1, bImm
6625 ret
6626 %assign bImm bImm + 1
6627 %endrep
6628.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6629ENDPROC iemAImpl_ %+ %1 %+ _u128
6630%endmacro
6631
6632IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
6633IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
6634IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6635
6636;;
6637; SSE instructions with 8-bit immediates of the form
6638; xxx xmm1, xmm2, imm8.
6639; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6640; register.
6641;
; Each worker dispatches into a 256-entry stub table (insn+ret+int3, 8 bytes
; each; 12 with IBT) because the imm8 must be encoded into the instruction.
;
6642; @param 1 The instruction name.
6643;
6644; @param A0 Pointer to the MXCSR value (input/output).
6645; @param A1 Pointer to the first media register size operand (output).
6646; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6647; @param A3 The 8-bit immediate (input).
6648;
6649%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
6650BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6651 PROLOGUE_4_ARGS
6652 IEMIMPL_SSE_PROLOGUE
6653 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6654
6655 movzx A3, A3_8 ; must clear top bits
6656 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6657 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6658 lea T1, [.imm0 xWrtRIP] ; T1 = base of the 256-entry stub table.
6659 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6660 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
6661 lea T1, [T1 + T0*4]
6662 %else
6663 lea T1, [T1 + A3*8] ; sizeof(insn+ret+int3) == 8: A3 * 8
6664 %endif
6665 IBT_NOTRACK
6666 call T1 ; Execute the stub matching the immediate.
6667 movdqu [A1], xmm0 ; Store the result.
6668
6669 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6670 IEMIMPL_SSE_EPILOGUE
6671 EPILOGUE_4_ARGS
6672 %assign bImm 0
6673 %rep 256
6674.imm %+ bImm:
6675 IBT_ENDBRxx_WITHOUT_NOTRACK
6676 %1 xmm0, xmm1, bImm
6677 ret
6678 int3 ; Padding so every stub is exactly 8 bytes.
6679 %assign bImm bImm + 1
6680 %endrep
6681.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
6682ENDPROC iemAImpl_ %+ %1 %+ _u128
6683%endmacro
6684
6685IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
6686IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
6687IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
6688IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6689IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6690IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6691
6692
6693;;
6694; SSE instructions of the form
6695; xxx mm, xmm.
6696; and we need to load and save the MXCSR register.
6697;
; Used for the packed double -> MMX integer conversions (cvt[t]pd2pi).
;
6698; @param 1 The instruction name.
6699;
6700; @param A0 Pointer to the MXCSR value (input/output).
6701; @param A1 Pointer to the first MMX register sized operand (output).
6702; @param A2 Pointer to the media register sized operand (input).
6703;
6704%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
6705BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6706 PROLOGUE_3_ARGS
6707 IEMIMPL_SSE_PROLOGUE
6708 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6709
6710 movdqu xmm0, [A2]
6711 %1 mm0, xmm0
6712 movq [A1], mm0 ; Store the 64-bit MMX result.
6713
6714 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6715 IEMIMPL_SSE_EPILOGUE
6716 EPILOGUE_3_ARGS
6717ENDPROC iemAImpl_ %+ %1 %+ _u128
6718%endmacro
6719
6720IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
6721IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6722
6723;;
6724; SSE instructions of the form
6725; xxx xmm, xmm/m64.
6726; and we need to load and save the MXCSR register.
6727;
; Used for the MMX integer -> packed float conversions (cvtpi2ps/cvtpi2pd).
;
6728; @param 1 The instruction name.
6729;
6730; @param A0 Pointer to the MXCSR value (input/output).
6731; @param A1 Pointer to the first media register sized operand (input/output).
6732; @param A2 The 64bit source value from a MMX media register (input)
6733;
6734%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
6735BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6736 PROLOGUE_3_ARGS
6737 IEMIMPL_SSE_PROLOGUE
6738 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6739
6740 movdqu xmm0, [A1]
6741 movq mm0, A2 ; A2 is the 64-bit source value itself (GPR -> MMX).
6742 %1 xmm0, mm0
6743 movdqu [A1], xmm0 ; Write back the updated XMM operand.
6744
6745 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6746 IEMIMPL_SSE_EPILOGUE
6747 EPILOGUE_3_ARGS
6748ENDPROC iemAImpl_ %+ %1 %+ _u128
6749%endmacro
6750
6751IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
6752IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6753
6754;;
6755; SSE instructions of the form
6756; xxx mm, xmm/m64.
6757; and we need to load and save the MXCSR register.
6758;
; Used for the packed single -> MMX integer conversions (cvt[t]ps2pi).
;
6759; @param 1 The instruction name.
6760;
6761; @param A0 Pointer to the MXCSR value (input/output).
6762; @param A1 Pointer to the first MMX media register sized operand (output).
6763; @param A2 The 64bit source value (input).
6764;
6765%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
6766BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6767 PROLOGUE_3_ARGS
6768 IEMIMPL_SSE_PROLOGUE
6769 SSE_LD_FXSTATE_MXCSR_ONLY A0 ; Activate the guest MXCSR.
6770
6771 movq xmm0, A2 ; A2 is the 64-bit source value itself (GPR -> XMM).
6772 %1 mm0, xmm0
6773 movq [A1], mm0 ; Store the 64-bit MMX result.
6774
6775 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Return merged MXCSR, restore host MXCSR.
6776 IEMIMPL_SSE_EPILOGUE
6777 EPILOGUE_3_ARGS
6778ENDPROC iemAImpl_ %+ %1 %+ _u128
6779%endmacro
6780
6781IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
6782IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6783
6784;
6785; All forms of RDRAND and RDSEED
6786;
; @param 1 The instruction mnemonic (rdrand/rdseed).
; @param 2 The host register to receive the random value (ax/eax/rax).
; @param 3 The operand width in bits, used in the function name suffix.
;
6787; @param A0 Pointer to the destination operand.
6788; @param A1 Pointer to the EFLAGS value (input/output).
6789;
6790%macro IEMIMPL_RDRAND_RDSEED 3
6791BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
6792 PROLOGUE_2_ARGS
6793
6794 %1 %2 ; e.g. rdrand eax - CF signals whether a value was returned.
6795 mov [A0], %2
6796 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Propagate CF (and cleared flags) to the guest.
6797
6798 EPILOGUE_2_ARGS
6799ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
6800%endmacro
6801
6802IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
6803IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
6804IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
6805IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
6806IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
6807IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6808
6809
6810;;
6811; sha1rnds4 xmm1, xmm2, imm8.
6812;
; Dispatches into a 256-entry stub table of sha1rnds4+ret (6 bytes each; 10
; with IBT), since the round-function imm8 must be encoded into the
; instruction.
;
6813; @param 1 The instruction name.
6814;
6815; @param A0 Pointer to the first media register size operand (input/output).
6816; @param A1 Pointer to the second source media register size operand (input).
6817; @param A2 The 8-bit immediate
6818;
6819BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
6820 PROLOGUE_3_ARGS
6821 IEMIMPL_SSE_PROLOGUE
6822
6823 movzx A2, A2_8 ; must clear top bits
6824 movdqu xmm0, [A0]
6825 movdqu xmm1, [A1]
6826 lea T1, [.imm0 xWrtRIP] ; T1 = base of the 256-entry stub table.
6827 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6828 lea T0, [A2 + A2*4] ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
6829 %else
6830 lea T0, [A2 + A2*2] ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
6831 %endif
6832 lea T1, [T1 + T0*2]
6833 IBT_NOTRACK
6834 call T1 ; Execute the stub matching the immediate.
6835 movdqu [A0], xmm0 ; Write back the updated destination operand.
6836
6837 IEMIMPL_SSE_EPILOGUE
6838 EPILOGUE_3_ARGS
6839 %assign bImm 0
6840 %rep 256
6841.imm %+ bImm:
6842 IBT_ENDBRxx_WITHOUT_NOTRACK
6843 sha1rnds4 xmm0, xmm1, bImm
6844 ret
6845 %assign bImm bImm + 1
6846 %endrep
6847.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6848ENDPROC iemAImpl_sha1rnds4_u128
6849
6850
6851;;
6852; sha256rnds2 xmm1, xmm2, <XMM0>.
6853;
6856; @param A0 Pointer to the first media register size operand (input/output).
6857; @param A1 Pointer to the second source media register size operand (input).
6858; @param A2 Pointer to the implicit XMM0 constants (input).
6859;
6860BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
6861 PROLOGUE_3_ARGS
6862 IEMIMPL_SSE_PROLOGUE
6863
 ; sha256rnds2 uses XMM0 as an implicit third operand, so load the caller
 ; supplied constants (A2) into it before performing the two rounds.
6864 movdqu xmm0, [A2]
6865 movdqu xmm1, [A0]
6866 movdqu xmm2, [A1]
6867 sha256rnds2 xmm1, xmm2
 ; Store the updated state back to the destination operand.
6868 movdqu [A0], xmm1
6869
6870 IEMIMPL_SSE_EPILOGUE
6871 EPILOGUE_3_ARGS
6872ENDPROC iemAImpl_sha256rnds2_u128
6873
6874
6875;;
6876; 32-bit forms of ADCX and ADOX.
6877;
; @param 1 The instruction name (adcx or adox).
; @param 2 The EFLAGS bit the instruction consumes and produces (X86_EFL_CF or X86_EFL_OF).
;
6878; @param A0 Pointer to the destination operand (input/output).
6879; @param A1 32-bit source operand 1 (input).
6880; @param A2 Pointer to the EFLAGS value (input/output).
6881;
6882%macro IEMIMPL_ADX_32 2
6883BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
6884 PROLOGUE_4_ARGS
6885
 ; Seed the single flag the instruction consumes (%2: X86_EFL_CF for adcx,
 ; X86_EFL_OF for adox) from the guest EFLAGS, perform the add-with-carry
 ; into the destination, then write only that flag back.
6886 IEM_LOAD_FLAGS A2, %2, 0
6887 %1 A1_32, [A0]
6888 mov [A0], A1_32
6889 IEM_SAVE_FLAGS A2, %2, 0
6890
6891 EPILOGUE_4_ARGS
6892ENDPROC iemAImpl_ %+ %1 %+ _u32
6893%endmacro
6894
6895;;
6896; 64-bit forms of ADCX and ADOX.
6897;
; @param 1 The instruction name (adcx or adox).
; @param 2 The EFLAGS bit the instruction consumes and produces (X86_EFL_CF or X86_EFL_OF).
;
6898; @param A0 Pointer to the destination operand (input/output).
6899; @param A1 64-bit source operand 1 (input).
6900; @param A2 Pointer to the EFLAGS value (input/output).
6901;
6902%macro IEMIMPL_ADX_64 2
6903BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
6904 PROLOGUE_4_ARGS
6905
 ; Seed the single flag the instruction consumes (%2: X86_EFL_CF for adcx,
 ; X86_EFL_OF for adox) from the guest EFLAGS, perform the add-with-carry
 ; into the destination, then write only that flag back.
6906 IEM_LOAD_FLAGS A2, %2, 0
6907 %1 A1, [A0]
6908 mov [A0], A1
6909 IEM_SAVE_FLAGS A2, %2, 0
6910
6911 EPILOGUE_4_ARGS
6912ENDPROC iemAImpl_ %+ %1 %+ _u64
6913%endmacro
6914
 ; Instantiate the 32/64-bit workers: adcx operates on CF, adox on OF.
6915IEMIMPL_ADX_32 adcx, X86_EFL_CF
6916IEMIMPL_ADX_64 adcx, X86_EFL_CF
6917
6918IEMIMPL_ADX_32 adox, X86_EFL_OF
6919IEMIMPL_ADX_64 adox, X86_EFL_OF
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette