VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 103852

Last change on this file since 103852 was 103735, checked in by vboxsync, 9 months ago

VMM/IEM: Implement vpsrlv[dq], vpsravd, vpsllv[dq] instruction dispatch & emulation, bugref:9898

1; $Id: IEMAllAImpl.asm 103735 2024-03-08 05:15:24Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; RET XX / RET wrapper for fastcall.
43;
44%macro RET_FASTCALL 1
45%ifdef RT_ARCH_X86
46 %ifdef RT_OS_WINDOWS
47 ret %1
48 %else
49 ret
50 %endif
51%else
52 ret
53%endif
54%endmacro
55
56;;
57; NAME for fastcall functions.
58;
59;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
60; escaping (or whatever the dollar is good for here). Thus the ugly
61; prefix argument.
62;
63%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
64%ifdef RT_ARCH_X86
65 %ifdef RT_OS_WINDOWS
66 %undef NAME_FASTCALL
67 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
68 %endif
69%endif
70
71;;
72; BEGINPROC for fastcall functions.
73;
74; @param 1 The function name (C).
75; @param 2 The argument size on x86.
76;
77%macro BEGINPROC_FASTCALL 2
78GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
79 IBT_ENDBRxx
80%endmacro
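;
; Example (illustrative): BEGINPROC_FASTCALL iemAImpl_add_u8, 12 yields the
; decorated fastcall symbol @iemAImpl_add_u8@12 on 32-bit Windows, while on
; all other targets NAME_FASTCALL falls back to the regular NAME() mangling
; of iemAImpl_add_u8; an IBT_ENDBRxx landing pad follows when indirect branch
; tracking is enabled.
;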
81
82
83;
84; We employ some macro assembly here to hide the calling convention differences.
85;
86%ifdef RT_ARCH_AMD64
87 %macro PROLOGUE_1_ARGS 0
88 %endmacro
89 %macro EPILOGUE_1_ARGS 0
90 ret
91 %endmacro
92 %macro EPILOGUE_1_ARGS_EX 0
93 ret
94 %endmacro
95
96 %macro PROLOGUE_2_ARGS 0
97 %endmacro
98 %macro EPILOGUE_2_ARGS 0
99 ret
100 %endmacro
101 %macro EPILOGUE_2_ARGS_EX 1
102 ret
103 %endmacro
104
105 %macro PROLOGUE_3_ARGS 0
106 %endmacro
107 %macro EPILOGUE_3_ARGS 0
108 ret
109 %endmacro
110 %macro EPILOGUE_3_ARGS_EX 1
111 ret
112 %endmacro
113
114 %macro PROLOGUE_4_ARGS 0
115 %endmacro
116 %macro EPILOGUE_4_ARGS 0
117 ret
118 %endmacro
119 %macro EPILOGUE_4_ARGS_EX 1
120 ret
121 %endmacro
122
123 %ifdef ASM_CALL64_GCC
124 %define A0 rdi
125 %define A0_32 edi
126 %define A0_16 di
127 %define A0_8 dil
128
129 %define A1 rsi
130 %define A1_32 esi
131 %define A1_16 si
132 %define A1_8 sil
133
134 %define A2 rdx
135 %define A2_32 edx
136 %define A2_16 dx
137 %define A2_8 dl
138
139 %define A3 rcx
140 %define A3_32 ecx
141 %define A3_16 cx
142 %define A3_8 cl
143 %endif
144
145 %ifdef ASM_CALL64_MSC
146 %define A0 rcx
147 %define A0_32 ecx
148 %define A0_16 cx
149 %define A0_8 cl
150
151 %define A1 rdx
152 %define A1_32 edx
153 %define A1_16 dx
154 %define A1_8 dl
155
156 %define A2 r8
157 %define A2_32 r8d
158 %define A2_16 r8w
159 %define A2_8 r8b
160
161 %define A3 r9
162 %define A3_32 r9d
163 %define A3_16 r9w
164 %define A3_8 r9b
165 %endif
166
167 %define T0 rax
168 %define T0_32 eax
169 %define T0_16 ax
170 %define T0_8 al
171
172 %define T1 r11
173 %define T1_32 r11d
174 %define T1_16 r11w
175 %define T1_8 r11b
176
177 %define T2 r10 ; only AMD64
178 %define T2_32 r10d
179 %define T2_16 r10w
180 %define T2_8 r10b
181
182%else
183 ; x86
184 %macro PROLOGUE_1_ARGS 0
185 push edi
186 %endmacro
187 %macro EPILOGUE_1_ARGS 0
188 pop edi
189 ret 0
190 %endmacro
191 %macro EPILOGUE_1_ARGS_EX 1
192 pop edi
193 ret %1
194 %endmacro
195
196 %macro PROLOGUE_2_ARGS 0
197 push edi
198 %endmacro
199 %macro EPILOGUE_2_ARGS 0
200 pop edi
201 ret 0
202 %endmacro
203 %macro EPILOGUE_2_ARGS_EX 1
204 pop edi
205 ret %1
206 %endmacro
207
208 %macro PROLOGUE_3_ARGS 0
209 push ebx
210 mov ebx, [esp + 4 + 4]
211 push edi
212 %endmacro
213 %macro EPILOGUE_3_ARGS_EX 1
214 %if (%1) < 4
215 %error "With three args, at least 4 bytes must be removed from the stack upon return (32-bit)."
216 %endif
217 pop edi
218 pop ebx
219 ret %1
220 %endmacro
221 %macro EPILOGUE_3_ARGS 0
222 EPILOGUE_3_ARGS_EX 4
223 %endmacro
224
225 %macro PROLOGUE_4_ARGS 0
226 push ebx
227 push edi
228 push esi
229 mov ebx, [esp + 12 + 4 + 0]
230 mov esi, [esp + 12 + 4 + 4]
231 %endmacro
232 %macro EPILOGUE_4_ARGS_EX 1
233 %if (%1) < 8
234 %error "With four args, at least 8 bytes must be removed from the stack upon return (32-bit)."
235 %endif
236 pop esi
237 pop edi
238 pop ebx
239 ret %1
240 %endmacro
241 %macro EPILOGUE_4_ARGS 0
242 EPILOGUE_4_ARGS_EX 8
243 %endmacro
244
245 %define A0 ecx
246 %define A0_32 ecx
247 %define A0_16 cx
248 %define A0_8 cl
249
250 %define A1 edx
251 %define A1_32 edx
252 %define A1_16 dx
253 %define A1_8 dl
254
255 %define A2 ebx
256 %define A2_32 ebx
257 %define A2_16 bx
258 %define A2_8 bl
259
260 %define A3 esi
261 %define A3_32 esi
262 %define A3_16 si
263
264 %define T0 eax
265 %define T0_32 eax
266 %define T0_16 ax
267 %define T0_8 al
268
269 %define T1 edi
270 %define T1_32 edi
271 %define T1_16 di
272%endif
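;
; Example (illustrative): with the 32-bit fastcall convention a three argument
; helper wraps its body roughly like this, A0/A1 arriving in ecx/edx and A2
; being fetched from the stack:
;
;       PROLOGUE_3_ARGS     ;   push ebx
;                           ;   mov  ebx, [esp + 4 + 4]     ; A2
;                           ;   push edi
;       ...                 ;   body uses A0=ecx, A1=edx, A2=ebx
;       EPILOGUE_3_ARGS     ;   pop  edi
;                           ;   pop  ebx
;                           ;   ret  4                      ; drop the stacked A2
;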
273
274
275;;
276; Load the relevant flags from [%1] if there are undefined flags (%3).
277;
278; @remarks Clobbers T0, stack. Changes EFLAGS.
279; @param A2 The register pointing to the flags.
280; @param 1 The parameter (A0..A3) pointing to the eflags.
281; @param 2 The set of modified flags.
282; @param 3 The set of undefined flags.
283; @param 4 Force loading the flags.
284;
285%macro IEM_MAYBE_LOAD_FLAGS 3-4 1
286 %if (%3 + %4) != 0
287 pushf ; store current flags
288 mov T0_32, [%1] ; load the guest flags
289 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
290 and T0_32, (%2 | %3) ; select the modified and undefined flags.
291 or [xSP], T0 ; merge guest flags with host flags.
292 popf ; load the mixed flags.
293 %endif
294%endmacro
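;
; Example (illustrative): for adc the binary operator template further down
; invokes IEM_MAYBE_LOAD_FLAGS A2, <arithmetic flag mask>, 0, 1, which expands
; to the pushf/popf merge above so the emulated instruction sees the guest CF:
;
;       pushf
;       mov     T0_32, [A2]
;       and     dword [xSP], ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
;       and     T0_32,        (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
;       or      [xSP], T0
;       popf
;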
295
296;;
297; Load the relevant flags from [%1].
298;
299; @remarks Clobbers T0, stack. Changes EFLAGS.
300; @param A2 The register pointing to the flags.
301; @param 1 The parameter (A0..A3) pointing to the eflags.
302; @param 2 The set of flags to load.
303; @param 3 The set of undefined flags.
304;
305%macro IEM_LOAD_FLAGS 3
306 pushf ; store current flags
307 mov T0_32, [%1] ; load the guest flags
308 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
309 and T0_32, (%2 | %3) ; select the modified and undefined flags.
310 or [xSP], T0 ; merge guest flags with host flags.
311 popf ; load the mixed flags.
312%endmacro
313
314;;
315; Update the flags.
316;
317; @remarks Clobbers T0, T1, stack.
318; @param 1 The register pointing to the EFLAGS.
319; @param 2 The mask of modified flags to save.
320; @param 3 The mask of undefined flags to (maybe) save.
321;
322%macro IEM_SAVE_FLAGS 3
323 %if (%2 | %3) != 0
324 pushf
325 pop T1
326 mov T0_32, [%1] ; flags
327 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
328 and T1_32, (%2 | %3) ; select the modified and undefined flags.
329 or T0_32, T1_32 ; combine the flags.
330 mov [%1], T0_32 ; save the flags.
331 %endif
332%endmacro
333
334;;
335; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
336;
337; @remarks Clobbers T0, T1, stack.
338; @param 1 The register pointing to the EFLAGS.
339; @param 2 The mask of modified flags to save.
340; @param 3 Mask of additional flags to always clear
341; @param 4 Mask of additional flags to always set.
342;
343%macro IEM_SAVE_AND_ADJUST_FLAGS 4
344 %if (%2 | %3 | %4) != 0
345 pushf
346 pop T1
347 mov T0_32, [%1] ; load flags.
348 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
349 and T1_32, (%2) ; select the modified flags.
350 or T0_32, T1_32 ; combine the flags.
351 %if (%4) != 0
352 or T0_32, %4 ; add the always set flags.
353 %endif
354 mov [%1], T0_32 ; save the result.
355 %endif
356%endmacro
357
358;;
359; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
360; signed input (%4[%5]) and parity index (%6).
361;
362; This is used by MUL and IMUL, where the result (%4 & %6) ends up in xAX, which
363; is also T0. So we have to use T1 for the EFLAGS calculation and save T0/xAX
364; while we extract the %2 flags from the CPU EFLAGS, or use T2 (AMD64 only).
365;
366; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
367; @param 1 The register pointing to the EFLAGS.
368; @param 2 The mask of modified flags to save.
369; @param 3 Mask of additional flags to always clear
370; @param 4 The result register to set SF by.
371; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
372; @param 6 The (full) register containing the parity table index. Will be modified!
373
374%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
375 %ifdef RT_ARCH_AMD64
376 pushf
377 pop T2
378 %else
379 push T0
380 pushf
381 pop T0
382 %endif
383 mov T1_32, [%1] ; load flags.
384 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
385 %ifdef RT_ARCH_AMD64
386 and T2_32, (%2) ; select the modified flags.
387 or T1_32, T2_32 ; combine the flags.
388 %else
389 and T0_32, (%2) ; select the modified flags.
390 or T1_32, T0_32 ; combine the flags.
391 pop T0
392 %endif
393
394 ; First calculate SF as it's likely to be referring to the same register as %6 does.
395 bt %4, %5 - 1
396 jnc %%sf_clear
397 or T1_32, X86_EFL_SF
398 %%sf_clear:
399
400 ; Parity last.
401 and %6, 0xff
402 %ifdef RT_ARCH_AMD64
403 lea T2, [NAME(g_afParity) xWrtRIP]
404 or T1_8, [T2 + %6]
405 %else
406 or T1_8, [NAME(g_afParity) + %6]
407 %endif
408
409 mov [%1], T1_32 ; save the result.
410%endmacro
411
412;;
413; Calculates the new EFLAGS using fixed clear and set bit masks.
414;
415; @remarks Clobbers T0.
416; @param 1 The register pointing to the EFLAGS.
417; @param 2 Mask of additional flags to always clear
418; @param 3 Mask of additional flags to always set.
419;
420%macro IEM_ADJUST_FLAGS 3
421 %if (%2 | %3) != 0
422 mov T0_32, [%1] ; Load flags.
423 %if (%2) != 0
424 and T0_32, ~(%2) ; Remove the always cleared flags.
425 %endif
426 %if (%3) != 0
427 or T0_32, %3 ; Add the always set flags.
428 %endif
429 mov [%1], T0_32 ; Save the result.
430 %endif
431%endmacro
432
433;;
434; Calculates the new EFLAGS using fixed clear and set bit masks.
435;
436; @remarks Clobbers T0, %4, EFLAGS.
437; @param 1 The register pointing to the EFLAGS.
438; @param 2 Mask of additional flags to always clear
439; @param 3 Mask of additional flags to always set.
440; @param 4 The (full) register containing the parity table index. Will be modified!
441;
442%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
443 mov T0_32, [%1] ; Load flags.
444 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
445 %if (%3) != 0
446 or T0_32, %3 ; Add the always set flags.
447 %endif
448 and %4, 0xff
449 %ifdef RT_ARCH_AMD64
450 lea T2, [NAME(g_afParity) xWrtRIP]
451 or T0_8, [T2 + %4]
452 %else
453 or T0_8, [NAME(g_afParity) + %4]
454 %endif
455 mov [%1], T0_32 ; Save the result.
456%endmacro
457
458
459;;
460; Checks that the size expression %1 matches %2 adjusted according to
461; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
462; @param 1 The jump array size assembly expression.
463; @param 2 The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
464;
465%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
466 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
467 dw (0xffff - %2 - 256*4) + %1 ; will cause warning if entries are too big.
468 dw (0xffff + %2 + 256*4) - %1 ; will cause warning if entries are too small.
469 %else
470 dw (0xffff - %2) + %1 ; will cause warning if entries are too big.
471 dw (0xffff + %2) - %1 ; will cause warning if entries are too small.
472 %endif
473%endmacro
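;
; Example (illustrative): both dw operands above are crafted to fit in a word
; only when the actual size %1 matches the expected size %2. E.g. with %2 =
; 1024 and an actual size of 1028 (no IBT), the first operand evaluates to
; 0x10003 and the assembler warns about the word overflow; a too-small array
; trips the second operand the same way.
;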
474
475
476;*********************************************************************************************************************************
477;* External Symbols *
478;*********************************************************************************************************************************
479extern NAME(g_afParity)
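; (Presumably a 256-entry byte table holding X86_EFL_PF for indexes with even
; parity and zero otherwise, so it can be OR'ed straight into the low EFLAGS
; byte; see IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF above.)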
480
481
482;;
483; Macro for implementing a binary operator.
484;
485; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
486; variants, except on 32-bit systems where the 64-bit accesses require hand
487; coding.
488;
489; All the functions take a pointer to the destination memory operand in A0,
490; the source register operand in A1 and a pointer to eflags in A2.
491;
492; @param 1 The instruction mnemonic.
493; @param 2 Non-zero if there should be a locked version.
494; @param 3 The modified flags.
495; @param 4 The undefined flags.
496; @param 5 Force flag loading (ADC, SBC).
497;
498%macro IEMIMPL_BIN_OP 5
499BEGINCODE
500BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
501 PROLOGUE_3_ARGS
502 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
503 %1 byte [A0], A1_8
504 IEM_SAVE_FLAGS A2, %3, %4
505 EPILOGUE_3_ARGS
506ENDPROC iemAImpl_ %+ %1 %+ _u8
507
508BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
509 PROLOGUE_3_ARGS
510 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
511 %1 word [A0], A1_16
512 IEM_SAVE_FLAGS A2, %3, %4
513 EPILOGUE_3_ARGS
514ENDPROC iemAImpl_ %+ %1 %+ _u16
515
516BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
517 PROLOGUE_3_ARGS
518 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
519 %1 dword [A0], A1_32
520 IEM_SAVE_FLAGS A2, %3, %4
521 EPILOGUE_3_ARGS
522ENDPROC iemAImpl_ %+ %1 %+ _u32
523
524 %ifdef RT_ARCH_AMD64
525BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
526 PROLOGUE_3_ARGS
527 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
528 %1 qword [A0], A1
529 IEM_SAVE_FLAGS A2, %3, %4
530 EPILOGUE_3_ARGS_EX 8
531ENDPROC iemAImpl_ %+ %1 %+ _u64
532 %endif ; RT_ARCH_AMD64
533
534 %if %2 != 0 ; locked versions requested?
535
536BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
537 PROLOGUE_3_ARGS
538 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
539 lock %1 byte [A0], A1_8
540 IEM_SAVE_FLAGS A2, %3, %4
541 EPILOGUE_3_ARGS
542ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
543
544BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
545 PROLOGUE_3_ARGS
546 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
547 lock %1 word [A0], A1_16
548 IEM_SAVE_FLAGS A2, %3, %4
549 EPILOGUE_3_ARGS
550ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
551
552BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
553 PROLOGUE_3_ARGS
554 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
555 lock %1 dword [A0], A1_32
556 IEM_SAVE_FLAGS A2, %3, %4
557 EPILOGUE_3_ARGS
558ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
559
560 %ifdef RT_ARCH_AMD64
561BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
562 PROLOGUE_3_ARGS
563 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
564 lock %1 qword [A0], A1
565 IEM_SAVE_FLAGS A2, %3, %4
566 EPILOGUE_3_ARGS_EX 8
567ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
568 %endif ; RT_ARCH_AMD64
569 %endif ; locked
570%endmacro
571
572; instr,lock, modified-flags, undefined flags, force loading flags
573IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
574IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 1
575IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
576IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 1
577IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
578IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
579IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
580IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
581IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
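;
; For reference, each instantiation above corresponds roughly to a family of C
; prototypes of the following shape (argument order per the A0..A2 description
; in the macro header; see the IEM headers for the authoritative declarations):
;
;   IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8, (uint8_t *pu8Dst, uint8_t u8Src, uint32_t *pEFlags));
;   IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *pu32Dst, uint32_t u32Src, uint32_t *pEFlags));
;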
582
583
584;;
585; Macro for implementing a binary operator, VEX variant with separate input/output.
586;
587; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
588; where the 64-bit accesses require hand coding.
589;
590; All the functions take a pointer to the destination memory operand in A0,
591; the first source register operand in A1, the second source register operand
592; in A2 and a pointer to eflags in A3.
593;
594; @param 1 The instruction mnemonic.
595; @param 2 The modified flags.
596; @param 3 The undefined flags.
597;
598%macro IEMIMPL_VEX_BIN_OP 3
599BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
600 PROLOGUE_4_ARGS
601 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
602 %1 T0_32, A1_32, A2_32
603 mov [A0], T0_32
604 IEM_SAVE_FLAGS A3, %2, %3
605 EPILOGUE_4_ARGS
606ENDPROC iemAImpl_ %+ %1 %+ _u32
607
608 %ifdef RT_ARCH_AMD64
609BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
610 PROLOGUE_4_ARGS
611 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
612 %1 T0, A1, A2
613 mov [A0], T0
614 IEM_SAVE_FLAGS A3, %2, %3
615 EPILOGUE_4_ARGS
616ENDPROC iemAImpl_ %+ %1 %+ _u64
617 %endif ; RT_ARCH_AMD64
618%endmacro
619
620; instr, modified-flags, undefined-flags
621IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
622IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
623IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
624
625;;
626; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
627;
628; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
629; where the 64-bit accesses require hand coding.
630;
631; All the functions take a pointer to the destination memory operand in A0,
632; the source register operand in A1 and a pointer to eflags in A2.
633;
634; @param 1 The instruction mnemonic.
635; @param 2 The modified flags.
636; @param 3 The undefined flags.
637;
638%macro IEMIMPL_VEX_BIN_OP_2 3
639BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
640 PROLOGUE_4_ARGS
641 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
642 mov T0_32, [A0]
643 %1 T0_32, A1_32
644 mov [A0], T0_32
645 IEM_SAVE_FLAGS A2, %2, %3
646 EPILOGUE_4_ARGS
647ENDPROC iemAImpl_ %+ %1 %+ _u32
648
649 %ifdef RT_ARCH_AMD64
650BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
651 PROLOGUE_4_ARGS
652 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
653 mov T0, [A0]
654 %1 T0, A1
655 mov [A0], T0
656 IEM_SAVE_FLAGS A2, %2, %3
657 EPILOGUE_4_ARGS
658ENDPROC iemAImpl_ %+ %1 %+ _u64
659 %endif ; RT_ARCH_AMD64
660%endmacro
661
662; instr, modified-flags, undefined-flags
663IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
664IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
665IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
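;
; For reference: blsr computes dst = src & (src - 1), blsmsk dst = src ^ (src - 1)
; and blsi dst = src & -src, with the source taken from A1 and the result
; written back to [A0].
;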
666
667
668;;
669; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
670;
671; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
672; where the 64-bit accesses require hand coding.
673;
674; All the functions take a pointer to the destination memory operand in A0, the
675; first source register operand in A1 and the second source register operand in
676; A2. There is no EFLAGS pointer since these instructions do not modify any flags.
677;
678; @param 1 The instruction mnemonic.
679; @param 2 Fallback instruction if applicable.
680; @param 3 Whether to emit fallback or not.
681;
682%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
683BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
684 PROLOGUE_3_ARGS
685 %1 T0_32, A1_32, A2_32
686 mov [A0], T0_32
687 EPILOGUE_3_ARGS
688ENDPROC iemAImpl_ %+ %1 %+ _u32
689
690 %if %3
691BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
692 PROLOGUE_3_ARGS
693 %ifdef ASM_CALL64_GCC
694 mov cl, A2_8
695 %2 A1_32, cl
696 mov [A0], A1_32
697 %else
698 xchg A2, A0
699 %2 A1_32, cl
700 mov [A2], A1_32
701 %endif
702 EPILOGUE_3_ARGS
703ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
704 %endif
705
706 %ifdef RT_ARCH_AMD64
707BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
708 PROLOGUE_3_ARGS
709 %1 T0, A1, A2
710 mov [A0], T0
711 EPILOGUE_3_ARGS
712ENDPROC iemAImpl_ %+ %1 %+ _u64
713
714 %if %3
715BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
716 PROLOGUE_3_ARGS
717 %ifdef ASM_CALL64_GCC
718 mov cl, A2_8
719 %2 A1, cl
720 mov [A0], A1_32
721 %else
722 xchg A2, A0
723 %2 A1, cl
724 mov [A2], A1_32
725 %endif
726 mov [A0], A1
727 EPILOGUE_3_ARGS
728ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
729 %endif
730 %endif ; RT_ARCH_AMD64
731%endmacro
732
733; instr, fallback instr, emit fallback
734IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
735IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
736IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
737IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
738IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
739
740
741;
742; RORX uses an immediate byte for the rotate count, so we only do a
743; fallback implementation of that one.
744;
745BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
746 PROLOGUE_3_ARGS
747 %ifdef ASM_CALL64_GCC
748 mov cl, A2_8
749 ror A1_32, cl
750 mov [A0], A1_32
751 %else
752 xchg A2, A0
753 ror A1_32, cl
754 mov [A2], A1_32
755 %endif
756 EPILOGUE_3_ARGS
757ENDPROC iemAImpl_rorx_u32
758
759 %ifdef RT_ARCH_AMD64
760BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
761 PROLOGUE_3_ARGS
762 %ifdef ASM_CALL64_GCC
763 mov cl, A2_8
764 ror A1, cl
765 mov [A0], A1
766 %else
767 xchg A2, A0
768 ror A1, cl
769 mov [A2], A1
770 %endif
771 EPILOGUE_3_ARGS
772ENDPROC iemAImpl_rorx_u64
773 %endif ; RT_ARCH_AMD64
774
775
776;
777; MULX
778;
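; Note: MULX multiplies the implicit rDX/EDX operand (uSrc1, cf. the comments
; below) by the explicit source and returns the full double-width product in
; two destinations without touching EFLAGS, which is why these helpers have no
; flag load/save calls.
;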
779BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
780 PROLOGUE_4_ARGS
781%ifdef ASM_CALL64_GCC
782 ; A2_32 is EDX - perfect
783 mulx T0_32, T1_32, A3_32
784 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
785 mov [A0], T0_32
786%else
787 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
788 xchg A1, A2
789 mulx T0_32, T1_32, A3_32
790 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
791 mov [A0], T0_32
792%endif
793 EPILOGUE_4_ARGS
794ENDPROC iemAImpl_mulx_u32
795
796
797BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
798 PROLOGUE_4_ARGS
799%ifdef ASM_CALL64_GCC
800 ; A2_32 is EDX, T0_32 is EAX
801 mov eax, A3_32
802 mul A2_32
803 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
804 mov [A0], edx
805%else
806 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
807 xchg A1, A2
808 mov eax, A3_32
809 mul A2_32
810 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
811 mov [A0], edx
812%endif
813 EPILOGUE_4_ARGS
814ENDPROC iemAImpl_mulx_u32_fallback
815
816%ifdef RT_ARCH_AMD64
817BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
818 PROLOGUE_4_ARGS
819%ifdef ASM_CALL64_GCC
820 ; A2 is RDX - perfect
821 mulx T0, T1, A3
822 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
823 mov [A0], T0
824%else
825 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
826 xchg A1, A2
827 mulx T0, T1, A3
828 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
829 mov [A0], T0
830%endif
831 EPILOGUE_4_ARGS
832ENDPROC iemAImpl_mulx_u64
833
834
835BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
836 PROLOGUE_4_ARGS
837%ifdef ASM_CALL64_GCC
838 ; A2 is RDX, T0 is RAX
839 mov rax, A3
840 mul A2
841 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
842 mov [A0], rdx
843%else
844 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
845 xchg A1, A2
846 mov rax, A3
847 mul A2
848 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
849 mov [A0], rdx
850%endif
851 EPILOGUE_4_ARGS
852ENDPROC iemAImpl_mulx_u64_fallback
853
854%endif
855
856
857;;
858; Macro for implementing a bit operator.
859;
860; This will generate code for the 16, 32 and 64 bit accesses with locked
861; variants, except on 32-bit systems where the 64-bit accesses require hand
862; coding.
863;
864; All the functions take a pointer to the destination memory operand in A0,
865; the source register operand in A1 and a pointer to eflags in A2.
866;
867; @param 1 The instruction mnemonic.
868; @param 2 Non-zero if there should be a locked version.
869; @param 3 The modified flags.
870; @param 4 The undefined flags.
871;
872%macro IEMIMPL_BIT_OP 4
873BEGINCODE
874BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
875 PROLOGUE_3_ARGS
876 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
877 %1 word [A0], A1_16
878 IEM_SAVE_FLAGS A2, %3, %4
879 EPILOGUE_3_ARGS
880ENDPROC iemAImpl_ %+ %1 %+ _u16
881
882BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
883 PROLOGUE_3_ARGS
884 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
885 %1 dword [A0], A1_32
886 IEM_SAVE_FLAGS A2, %3, %4
887 EPILOGUE_3_ARGS
888ENDPROC iemAImpl_ %+ %1 %+ _u32
889
890 %ifdef RT_ARCH_AMD64
891BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
892 PROLOGUE_3_ARGS
893 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
894 %1 qword [A0], A1
895 IEM_SAVE_FLAGS A2, %3, %4
896 EPILOGUE_3_ARGS_EX 8
897ENDPROC iemAImpl_ %+ %1 %+ _u64
898 %endif ; RT_ARCH_AMD64
899
900 %if %2 != 0 ; locked versions requested?
901
902BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
903 PROLOGUE_3_ARGS
904 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
905 lock %1 word [A0], A1_16
906 IEM_SAVE_FLAGS A2, %3, %4
907 EPILOGUE_3_ARGS
908ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
909
910BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
911 PROLOGUE_3_ARGS
912 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
913 lock %1 dword [A0], A1_32
914 IEM_SAVE_FLAGS A2, %3, %4
915 EPILOGUE_3_ARGS
916ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
917
918 %ifdef RT_ARCH_AMD64
919BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
920 PROLOGUE_3_ARGS
921 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
922 lock %1 qword [A0], A1
923 IEM_SAVE_FLAGS A2, %3, %4
924 EPILOGUE_3_ARGS_EX 8
925ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
926 %endif ; RT_ARCH_AMD64
927 %endif ; locked
928%endmacro
929; modified efl, undefined eflags
930IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
931IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
932IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
933IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
934
935;;
936; Macro for implementing a bit search operator.
937;
938; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
939; systems where the 64-bit accesses require hand coding.
940;
941; All the functions take a pointer to the destination memory operand in A0,
942; the source register operand in A1 and a pointer to eflags in A2.
943;
944; In the ZF case the destination register is 'undefined', however it seems that
945; both AMD and Intel just leave it as is. The undefined EFLAGS differ between
946; AMD and Intel and, according to https://www.sandpile.org/x86/flags.htm, between
947; Intel microarchitectures. We only implement the 'intel' and 'amd' variations with
948; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
949;
950; @param 1 The instruction mnemonic.
951; @param 2 The modified flags.
952; @param 3 The undefined flags.
953; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
954;
955%macro IEMIMPL_BIT_OP2 4
956BEGINCODE
957BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
958 PROLOGUE_3_ARGS
959 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
960 %1 T0_16, A1_16
961%if %4 != 0
962 jz .unchanged_dst
963%endif
964 mov [A0], T0_16
965.unchanged_dst:
966 IEM_SAVE_FLAGS A2, %2, %3
967 EPILOGUE_3_ARGS
968ENDPROC iemAImpl_ %+ %1 %+ _u16
969
970BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
971 PROLOGUE_3_ARGS
972 %1 T1_16, A1_16
973%if %4 != 0
974 jz .unchanged_dst
975%endif
976 mov [A0], T1_16
977 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
978 EPILOGUE_3_ARGS
979.unchanged_dst:
980 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
981 EPILOGUE_3_ARGS
982ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
983
984BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
985 PROLOGUE_3_ARGS
986 %1 T0_16, A1_16
987%if %4 != 0
988 jz .unchanged_dst
989%endif
990 mov [A0], T0_16
991.unchanged_dst:
992 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
993 EPILOGUE_3_ARGS
994ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
995
996
997BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
998 PROLOGUE_3_ARGS
999 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1000 %1 T0_32, A1_32
1001%if %4 != 0
1002 jz .unchanged_dst
1003%endif
1004 mov [A0], T0_32
1005.unchanged_dst:
1006 IEM_SAVE_FLAGS A2, %2, %3
1007 EPILOGUE_3_ARGS
1008ENDPROC iemAImpl_ %+ %1 %+ _u32
1009
1010BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1011 PROLOGUE_3_ARGS
1012 %1 T1_32, A1_32
1013%if %4 != 0
1014 jz .unchanged_dst
1015%endif
1016 mov [A0], T1_32
1017 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1018 EPILOGUE_3_ARGS
1019.unchanged_dst:
1020 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1021 EPILOGUE_3_ARGS
1022ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1023
1024BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1025 PROLOGUE_3_ARGS
1026 %1 T0_32, A1_32
1027%if %4 != 0
1028 jz .unchanged_dst
1029%endif
1030 mov [A0], T0_32
1031.unchanged_dst:
1032 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1033 EPILOGUE_3_ARGS
1034ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1035
1036
1037 %ifdef RT_ARCH_AMD64
1038
1039BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1040 PROLOGUE_3_ARGS
1041 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1042 %1 T0, A1
1043%if %4 != 0
1044 jz .unchanged_dst
1045%endif
1046 mov [A0], T0
1047.unchanged_dst:
1048 IEM_SAVE_FLAGS A2, %2, %3
1049 EPILOGUE_3_ARGS_EX 8
1050ENDPROC iemAImpl_ %+ %1 %+ _u64
1051
1052BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1053 PROLOGUE_3_ARGS
1054 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1055 %1 T1, A1
1056%if %4 != 0
1057 jz .unchanged_dst
1058%endif
1059 mov [A0], T1
1060 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1061 EPILOGUE_3_ARGS
1062.unchanged_dst:
1063 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1064 EPILOGUE_3_ARGS
1065ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1066
1067BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1068 PROLOGUE_3_ARGS
1069 %1 T0, A1
1070%if %4 != 0
1071 jz .unchanged_dst
1072%endif
1073 mov [A0], T0
1074.unchanged_dst:
1075 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1076 EPILOGUE_3_ARGS_EX 8
1077ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1078
1079 %endif ; RT_ARCH_AMD64
1080%endmacro
1081
1082IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1083IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1084IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1085IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1086
1087
1088;;
1089; Macro for implementing POPCNT.
1090;
1091; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1092; systems where the 64-bit accesses require hand coding.
1093;
1094; All the functions take a pointer to the destination memory operand in A0,
1095; the source register operand in A1 and a pointer to eflags in A2.
1096;
1097; ASSUMES Intel and AMD set EFLAGS the same way.
1098;
1099; ASSUMES the instruction does not support memory destination.
1100;
1101; @param 1 The instruction mnemonic.
1102; @param 2 The modified flags.
1103; @param 3 The undefined flags.
1104;
1105%macro IEMIMPL_BIT_OP3 3
1106BEGINCODE
1107BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1108 PROLOGUE_3_ARGS
1109 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
1110 %1 T0_16, A1_16
1111 mov [A0], T0_16
1112 IEM_SAVE_FLAGS A2, %2, %3
1113 EPILOGUE_3_ARGS
1114ENDPROC iemAImpl_ %+ %1 %+ _u16
1115
1116BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1117 PROLOGUE_3_ARGS
1118 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
1119 %1 T0_32, A1_32
1120 mov [A0], T0_32
1121 IEM_SAVE_FLAGS A2, %2, %3
1122 EPILOGUE_3_ARGS
1123ENDPROC iemAImpl_ %+ %1 %+ _u32
1124
1125 %ifdef RT_ARCH_AMD64
1126BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1127 PROLOGUE_3_ARGS
1128 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
1129 %1 T0, A1
1130 mov [A0], T0
1131 IEM_SAVE_FLAGS A2, %2, %3
1132 EPILOGUE_3_ARGS_EX 8
1133ENDPROC iemAImpl_ %+ %1 %+ _u64
1134 %endif ; RT_ARCH_AMD64
1135%endmacro
1136IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
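; (POPCNT clears OF, SF, AF, CF and PF and sets ZF only when the source is
; zero, hence all six status flags are listed as modified and none as
; undefined.)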
1137
1138
1139;
1140; IMUL is a similar yet different case (no lock, no memory destination).
1141; The rDX:rAX variant of imul is handled together with mul further down.
1142;
1143BEGINCODE
1144; @param 1 EFLAGS that are modified.
1145; @param 2 Undefined EFLAGS.
1146; @param 3 Function suffix.
1147; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored),
1148; 2 for AMD (set AF, clear PF, ZF and SF).
1149%macro IEMIMPL_IMUL_TWO 4
1150BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1151 PROLOGUE_3_ARGS
1152 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1153 imul A1_16, word [A0]
1154 mov [A0], A1_16
1155 %if %4 != 1
1156 IEM_SAVE_FLAGS A2, %1, %2
1157 %else
1158 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1159 %endif
1160 EPILOGUE_3_ARGS
1161ENDPROC iemAImpl_imul_two_u16 %+ %3
1162
1163BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1164 PROLOGUE_3_ARGS
1165 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1166 imul A1_32, dword [A0]
1167 mov [A0], A1_32
1168 %if %4 != 1
1169 IEM_SAVE_FLAGS A2, %1, %2
1170 %else
1171 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1172 %endif
1173 EPILOGUE_3_ARGS
1174ENDPROC iemAImpl_imul_two_u32 %+ %3
1175
1176 %ifdef RT_ARCH_AMD64
1177BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1178 PROLOGUE_3_ARGS
1179 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1180 imul A1, qword [A0]
1181 mov [A0], A1
1182 %if %4 != 1
1183 IEM_SAVE_FLAGS A2, %1, %2
1184 %else
1185 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1186 %endif
1187 EPILOGUE_3_ARGS_EX 8
1188ENDPROC iemAImpl_imul_two_u64 %+ %3
1189 %endif ; RT_ARCH_AMD64
1190%endmacro
1191IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1192IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1193IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1194
1195
1196;
1197; XCHG for memory operands. This implies locking. No flag changes.
1198;
1199; Each function takes two arguments, first the pointer to the memory,
1200; then the pointer to the register. They all return void.
1201;
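; (The 'locked' variants rely on XCHG's implicit LOCK semantics when a memory
; operand is involved, so no explicit lock prefix is emitted.)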
1202BEGINCODE
1203BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1204 PROLOGUE_2_ARGS
1205 mov T0_8, [A1]
1206 xchg [A0], T0_8
1207 mov [A1], T0_8
1208 EPILOGUE_2_ARGS
1209ENDPROC iemAImpl_xchg_u8_locked
1210
1211BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1212 PROLOGUE_2_ARGS
1213 mov T0_16, [A1]
1214 xchg [A0], T0_16
1215 mov [A1], T0_16
1216 EPILOGUE_2_ARGS
1217ENDPROC iemAImpl_xchg_u16_locked
1218
1219BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1220 PROLOGUE_2_ARGS
1221 mov T0_32, [A1]
1222 xchg [A0], T0_32
1223 mov [A1], T0_32
1224 EPILOGUE_2_ARGS
1225ENDPROC iemAImpl_xchg_u32_locked
1226
1227%ifdef RT_ARCH_AMD64
1228BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1229 PROLOGUE_2_ARGS
1230 mov T0, [A1]
1231 xchg [A0], T0
1232 mov [A1], T0
1233 EPILOGUE_2_ARGS
1234ENDPROC iemAImpl_xchg_u64_locked
1235%endif
1236
1237; Unlocked variants for fDisregardLock mode.
1238
1239BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
1240 PROLOGUE_2_ARGS
1241 mov T0_8, [A1]
1242 mov T1_8, [A0]
1243 mov [A0], T0_8
1244 mov [A1], T1_8
1245 EPILOGUE_2_ARGS
1246ENDPROC iemAImpl_xchg_u8_unlocked
1247
1248BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
1249 PROLOGUE_2_ARGS
1250 mov T0_16, [A1]
1251 mov T1_16, [A0]
1252 mov [A0], T0_16
1253 mov [A1], T1_16
1254 EPILOGUE_2_ARGS
1255ENDPROC iemAImpl_xchg_u16_unlocked
1256
1257BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
1258 PROLOGUE_2_ARGS
1259 mov T0_32, [A1]
1260 mov T1_32, [A0]
1261 mov [A0], T0_32
1262 mov [A1], T1_32
1263 EPILOGUE_2_ARGS
1264ENDPROC iemAImpl_xchg_u32_unlocked
1265
1266%ifdef RT_ARCH_AMD64
1267BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
1268 PROLOGUE_2_ARGS
1269 mov T0, [A1]
1270 mov T1, [A0]
1271 mov [A0], T0
1272 mov [A1], T1
1273 EPILOGUE_2_ARGS
1274ENDPROC iemAImpl_xchg_u64_unlocked
1275%endif
1276
1277
1278;
1279; XADD for memory operands.
1280;
1281; Each function takes three arguments, first the pointer to the
1282; memory/register, then the pointer to the register, and finally a pointer to
1283; eflags. They all return void.
1284;
1285BEGINCODE
1286BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1287 PROLOGUE_3_ARGS
1288 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1289 mov T0_8, [A1]
1290 xadd [A0], T0_8
1291 mov [A1], T0_8
1292 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1293 EPILOGUE_3_ARGS
1294ENDPROC iemAImpl_xadd_u8
1295
1296BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1297 PROLOGUE_3_ARGS
1298 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1299 mov T0_16, [A1]
1300 xadd [A0], T0_16
1301 mov [A1], T0_16
1302 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1303 EPILOGUE_3_ARGS
1304ENDPROC iemAImpl_xadd_u16
1305
1306BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1307 PROLOGUE_3_ARGS
1308 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1309 mov T0_32, [A1]
1310 xadd [A0], T0_32
1311 mov [A1], T0_32
1312 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1313 EPILOGUE_3_ARGS
1314ENDPROC iemAImpl_xadd_u32
1315
1316%ifdef RT_ARCH_AMD64
1317BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1318 PROLOGUE_3_ARGS
1319 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1320 mov T0, [A1]
1321 xadd [A0], T0
1322 mov [A1], T0
1323 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1324 EPILOGUE_3_ARGS
1325ENDPROC iemAImpl_xadd_u64
1326%endif ; RT_ARCH_AMD64
1327
1328BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
1329 PROLOGUE_3_ARGS
1330 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1331 mov T0_8, [A1]
1332 lock xadd [A0], T0_8
1333 mov [A1], T0_8
1334 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1335 EPILOGUE_3_ARGS
1336ENDPROC iemAImpl_xadd_u8_locked
1337
1338BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1339 PROLOGUE_3_ARGS
1340 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1341 mov T0_16, [A1]
1342 lock xadd [A0], T0_16
1343 mov [A1], T0_16
1344 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1345 EPILOGUE_3_ARGS
1346ENDPROC iemAImpl_xadd_u16_locked
1347
1348BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1349 PROLOGUE_3_ARGS
1350 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1351 mov T0_32, [A1]
1352 lock xadd [A0], T0_32
1353 mov [A1], T0_32
1354 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1355 EPILOGUE_3_ARGS
1356ENDPROC iemAImpl_xadd_u32_locked
1357
1358%ifdef RT_ARCH_AMD64
1359BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1360 PROLOGUE_3_ARGS
1361 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1362 mov T0, [A1]
1363 lock xadd [A0], T0
1364 mov [A1], T0
1365 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1366 EPILOGUE_3_ARGS
1367ENDPROC iemAImpl_xadd_u64_locked
1368%endif ; RT_ARCH_AMD64
1369
1370
1371;
1372; CMPXCHG8B.
1373;
1374; These are tricky register-wise, so the code is duplicated for each calling
1375; convention.
1376;
1377; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1378;
1379; C-proto:
1380; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1381; uint32_t *pEFlags));
1382;
1383; Note! Identical to iemAImpl_cmpxchg16b.
1384;
1385BEGINCODE
1386BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1387%ifdef RT_ARCH_AMD64
1388 %ifdef ASM_CALL64_MSC
1389 push rbx
1390
1391 mov r11, rdx ; pu64EaxEdx (is also T1)
1392 mov r10, rcx ; pu64Dst
1393
1394 mov ebx, [r8]
1395 mov ecx, [r8 + 4]
1396 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1397 mov eax, [r11]
1398 mov edx, [r11 + 4]
1399
1400 cmpxchg8b [r10]
1401
1402 mov [r11], eax
1403 mov [r11 + 4], edx
1404 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1405
1406 pop rbx
1407 ret
1408 %else
1409 push rbx
1410
1411 mov r10, rcx ; pEFlags
1412 mov r11, rdx ; pu64EbxEcx (is also T1)
1413
1414 mov ebx, [r11]
1415 mov ecx, [r11 + 4]
1416 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1417 mov eax, [rsi]
1418 mov edx, [rsi + 4]
1419
1420 cmpxchg8b [rdi]
1421
1422 mov [rsi], eax
1423 mov [rsi + 4], edx
1424 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1425
1426 pop rbx
1427 ret
1428
1429 %endif
1430%else
1431 push esi
1432 push edi
1433 push ebx
1434 push ebp
1435
1436 mov edi, ecx ; pu64Dst
1437 mov esi, edx ; pu64EaxEdx
1438 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1439 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1440
1441 mov ebx, [ecx]
1442 mov ecx, [ecx + 4]
1443 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1444 mov eax, [esi]
1445 mov edx, [esi + 4]
1446
1447 cmpxchg8b [edi]
1448
1449 mov [esi], eax
1450 mov [esi + 4], edx
1451 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1452
1453 pop ebp
1454 pop ebx
1455 pop edi
1456 pop esi
1457 ret 8
1458%endif
1459ENDPROC iemAImpl_cmpxchg8b
1460
1461BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1462%ifdef RT_ARCH_AMD64
1463 %ifdef ASM_CALL64_MSC
1464 push rbx
1465
1466 mov r11, rdx ; pu64EaxEdx (is also T1)
1467 mov r10, rcx ; pu64Dst
1468
1469 mov ebx, [r8]
1470 mov ecx, [r8 + 4]
1471 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1472 mov eax, [r11]
1473 mov edx, [r11 + 4]
1474
1475 lock cmpxchg8b [r10]
1476
1477 mov [r11], eax
1478 mov [r11 + 4], edx
1479 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1480
1481 pop rbx
1482 ret
1483 %else
1484 push rbx
1485
1486 mov r10, rcx ; pEFlags
1487 mov r11, rdx ; pu64EbxEcx (is also T1)
1488
1489 mov ebx, [r11]
1490 mov ecx, [r11 + 4]
1491 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1492 mov eax, [rsi]
1493 mov edx, [rsi + 4]
1494
1495 lock cmpxchg8b [rdi]
1496
1497 mov [rsi], eax
1498 mov [rsi + 4], edx
1499 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1500
1501 pop rbx
1502 ret
1503
1504 %endif
1505%else
1506 push esi
1507 push edi
1508 push ebx
1509 push ebp
1510
1511 mov edi, ecx ; pu64Dst
1512 mov esi, edx ; pu64EaxEdx
1513 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1514 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1515
1516 mov ebx, [ecx]
1517 mov ecx, [ecx + 4]
1518 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1519 mov eax, [esi]
1520 mov edx, [esi + 4]
1521
1522 lock cmpxchg8b [edi]
1523
1524 mov [esi], eax
1525 mov [esi + 4], edx
1526 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1527
1528 pop ebp
1529 pop ebx
1530 pop edi
1531 pop esi
1532 ret 8
1533%endif
1534ENDPROC iemAImpl_cmpxchg8b_locked
1535
1536%ifdef RT_ARCH_AMD64
1537
1538;
1539; CMPXCHG16B.
1540;
1541; These are tricky register-wise, so the code is duplicated for each calling
1542; convention.
1543;
1544; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1545;
1546; C-proto:
1547; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
1548; uint32_t *pEFlags));
1549;
1550; Note! Identical to iemAImpl_cmpxchg8b.
1551;
1552BEGINCODE
1553BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
1554 %ifdef ASM_CALL64_MSC
1555 push rbx
1556
1557 mov r11, rdx ; pu64RaxRdx (is also T1)
1558 mov r10, rcx ; pu64Dst
1559
1560 mov rbx, [r8]
1561 mov rcx, [r8 + 8]
1562 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1563 mov rax, [r11]
1564 mov rdx, [r11 + 8]
1565
1566 cmpxchg16b [r10]
1567
1568 mov [r11], rax
1569 mov [r11 + 8], rdx
1570 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1571
1572 pop rbx
1573 ret
1574 %else
1575 push rbx
1576
1577 mov r10, rcx ; pEFlags
1578 mov r11, rdx ; pu64RbxRcx (is also T1)
1579
1580 mov rbx, [r11]
1581 mov rcx, [r11 + 8]
1582 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1583 mov rax, [rsi]
1584 mov rdx, [rsi + 8]
1585
1586 cmpxchg16b [rdi]
1587
1588 mov [rsi], rax
1589 mov [rsi + 8], rdx
1590 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1591
1592 pop rbx
1593 ret
1594
1595 %endif
1596ENDPROC iemAImpl_cmpxchg16b
1597
1598BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
1599 %ifdef ASM_CALL64_MSC
1600 push rbx
1601
1602 mov r11, rdx ; pu64RaxRdx (is also T1)
1603 mov r10, rcx ; pu64Dst
1604
1605 mov rbx, [r8]
1606 mov rcx, [r8 + 8]
1607 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1608 mov rax, [r11]
1609 mov rdx, [r11 + 8]
1610
1611 lock cmpxchg16b [r10]
1612
1613 mov [r11], rax
1614 mov [r11 + 8], rdx
1615 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1616
1617 pop rbx
1618 ret
1619 %else
1620 push rbx
1621
1622 mov r10, rcx ; pEFlags
1623 mov r11, rdx ; pu64RbxRcx (is also T1)
1624
1625 mov rbx, [r11]
1626 mov rcx, [r11 + 8]
1627 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1628 mov rax, [rsi]
1629 mov rdx, [rsi + 8]
1630
1631 lock cmpxchg16b [rdi]
1632
1633 mov [rsi], rax
1634 mov [rsi + 8], rdx
1635 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1636
1637 pop rbx
1638 ret
1639
1640 %endif
1641ENDPROC iemAImpl_cmpxchg16b_locked
1642
1643%endif ; RT_ARCH_AMD64
1644
1645
1646;
1647; CMPXCHG.
1648;
1649; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1650;
1651; C-proto:
1652; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1653;
1654BEGINCODE
1655%macro IEMIMPL_CMPXCHG 2
1656BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1657 PROLOGUE_4_ARGS
1658 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
1659 mov al, [A1]
1660 %1 cmpxchg [A0], A2_8
1661 mov [A1], al
1662 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1663 EPILOGUE_4_ARGS
1664ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1665
1666BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1667 PROLOGUE_4_ARGS
1668 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
1669 mov ax, [A1]
1670 %1 cmpxchg [A0], A2_16
1671 mov [A1], ax
1672 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1673 EPILOGUE_4_ARGS
1674ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1675
1676BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1677 PROLOGUE_4_ARGS
1678 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
1679 mov eax, [A1]
1680 %1 cmpxchg [A0], A2_32
1681 mov [A1], eax
1682 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1683 EPILOGUE_4_ARGS
1684ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1685
1686BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1687%ifdef RT_ARCH_AMD64
1688 PROLOGUE_4_ARGS
1689 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
1690 mov rax, [A1]
1691 %1 cmpxchg [A0], A2
1692 mov [A1], rax
1693 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1694 EPILOGUE_4_ARGS
1695%else
1696 ;
1697 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
1698 ;
1699 push esi
1700 push edi
1701 push ebx
1702 push ebp
1703
1704 mov edi, ecx ; pu64Dst
1705 mov esi, edx ; pu64Rax
1706 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1707 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1708
1709 mov ebx, [ecx]
1710 mov ecx, [ecx + 4]
1711 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
1712 mov eax, [esi]
1713 mov edx, [esi + 4]
1714
1715 lock cmpxchg8b [edi]
1716
1717 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
1718 jz .cmpxchg8b_not_equal
1719;; @todo this isn't correct. Need to do a 64-bit compare, not just the lower 32-bit.
1720 cmp eax, eax ; just set the other flags.
1721.store:
1722 mov [esi], eax
1723 mov [esi + 4], edx
1724 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1725
1726 pop ebp
1727 pop ebx
1728 pop edi
1729 pop esi
1730 ret 8
1731
1732.cmpxchg8b_not_equal:
1733 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1734 jne .store
1735 cmp [esi], eax
1736 jmp .store
1737
1738%endif
1739ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1740%endmacro ; IEMIMPL_CMPXCHG
1741
1742IEMIMPL_CMPXCHG , ,
1743IEMIMPL_CMPXCHG lock, _locked
1744
1745;;
1746; Macro for implementing a unary operator.
1747;
1748; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1749; variants, except on 32-bit systems where the 64-bit accesses require hand
1750; coding.
1751;
1752; All the functions take a pointer to the destination memory operand in A0 and
1753; a pointer to eflags in A1 (there is no source register operand).
1754;
1755; @param 1 The instruction mnemonic.
1756; @param 2 The modified flags.
1757; @param 3 The undefined flags.
1758;
1759%macro IEMIMPL_UNARY_OP 3
1760BEGINCODE
1761BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1762 PROLOGUE_2_ARGS
1763 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1764 %1 byte [A0]
1765 IEM_SAVE_FLAGS A1, %2, %3
1766 EPILOGUE_2_ARGS
1767ENDPROC iemAImpl_ %+ %1 %+ _u8
1768
1769BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1770 PROLOGUE_2_ARGS
1771 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1772 lock %1 byte [A0]
1773 IEM_SAVE_FLAGS A1, %2, %3
1774 EPILOGUE_2_ARGS
1775ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1776
1777BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1778 PROLOGUE_2_ARGS
1779 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1780 %1 word [A0]
1781 IEM_SAVE_FLAGS A1, %2, %3
1782 EPILOGUE_2_ARGS
1783ENDPROC iemAImpl_ %+ %1 %+ _u16
1784
1785BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1786 PROLOGUE_2_ARGS
1787 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1788 lock %1 word [A0]
1789 IEM_SAVE_FLAGS A1, %2, %3
1790 EPILOGUE_2_ARGS
1791ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1792
1793BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1794 PROLOGUE_2_ARGS
1795 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1796 %1 dword [A0]
1797 IEM_SAVE_FLAGS A1, %2, %3
1798 EPILOGUE_2_ARGS
1799ENDPROC iemAImpl_ %+ %1 %+ _u32
1800
1801BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1802 PROLOGUE_2_ARGS
1803 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1804 lock %1 dword [A0]
1805 IEM_SAVE_FLAGS A1, %2, %3
1806 EPILOGUE_2_ARGS
1807ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1808
1809 %ifdef RT_ARCH_AMD64
1810BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1811 PROLOGUE_2_ARGS
1812 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1813 %1 qword [A0]
1814 IEM_SAVE_FLAGS A1, %2, %3
1815 EPILOGUE_2_ARGS
1816ENDPROC iemAImpl_ %+ %1 %+ _u64
1817
1818BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1819 PROLOGUE_2_ARGS
1820 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1821 lock %1 qword [A0]
1822 IEM_SAVE_FLAGS A1, %2, %3
1823 EPILOGUE_2_ARGS
1824ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1825 %endif ; RT_ARCH_AMD64
1826
1827%endmacro
1828
1829IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1830IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1831IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1832IEMIMPL_UNARY_OP not, 0, 0
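; (Note that INC and DEC leave CF untouched, which is why CF is absent from
; their modified flag masks above, while NEG updates the full arithmetic set
; and NOT changes no flags at all.)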
1833
1834
1835;
1836; BSWAP. No flag changes.
1837;
1838; Each function takes one argument, a pointer to the value to bswap
1839; (input/output). They all return void.
1840;
1841BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1842 PROLOGUE_1_ARGS
1843 mov T0_32, [A0] ; just in case any of the upper bits are used.
1844 db 66h
1845 bswap T0_32
1846 mov [A0], T0_32
1847 EPILOGUE_1_ARGS
1848ENDPROC iemAImpl_bswap_u16
1849
1850BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1851 PROLOGUE_1_ARGS
1852 mov T0_32, [A0]
1853 bswap T0_32
1854 mov [A0], T0_32
1855 EPILOGUE_1_ARGS
1856ENDPROC iemAImpl_bswap_u32
1857
1858BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1859%ifdef RT_ARCH_AMD64
1860 PROLOGUE_1_ARGS
1861 mov T0, [A0]
1862 bswap T0
1863 mov [A0], T0
1864 EPILOGUE_1_ARGS
1865%else
1866 PROLOGUE_1_ARGS
1867 mov T0, [A0]
1868 mov T1, [A0 + 4]
1869 bswap T0
1870 bswap T1
1871 mov [A0 + 4], T0
1872 mov [A0], T1
1873 EPILOGUE_1_ARGS
1874%endif
1875ENDPROC iemAImpl_bswap_u64
1876
1877
1878;;
1879; Macro for implementing a shift operation.
1880;
1881; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1882; 32-bit systems where the 64-bit accesses require hand coding.
1883;
1884; All the functions take a pointer to the destination memory operand in A0,
1885; the shift count in A1 and a pointer to eflags in A2.
1886;
1887; @param 1 The instruction mnemonic.
1888; @param 2 The modified flags.
1889; @param 3 The undefined flags.
1890; @param 4 Force load flags.
1891;
1892; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1893;
1894; @note the _intel and _amd variants are implemented in C.
1895;
1896%macro IEMIMPL_SHIFT_OP 4
1897BEGINCODE
1898BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1899 PROLOGUE_3_ARGS
1900 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
1901 %ifdef ASM_CALL64_GCC
1902 mov cl, A1_8
1903 %1 byte [A0], cl
1904 %else
1905 xchg A1, A0
1906 %1 byte [A1], cl
1907 %endif
1908 IEM_SAVE_FLAGS A2, %2, %3
1909 EPILOGUE_3_ARGS
1910ENDPROC iemAImpl_ %+ %1 %+ _u8
1911
1912BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1913 PROLOGUE_3_ARGS
1914 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
1915 %ifdef ASM_CALL64_GCC
1916 mov cl, A1_8
1917 %1 word [A0], cl
1918 %else
1919 xchg A1, A0
1920 %1 word [A1], cl
1921 %endif
1922 IEM_SAVE_FLAGS A2, %2, %3
1923 EPILOGUE_3_ARGS
1924ENDPROC iemAImpl_ %+ %1 %+ _u16
1925
1926BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1927 PROLOGUE_3_ARGS
1928 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
1929 %ifdef ASM_CALL64_GCC
1930 mov cl, A1_8
1931 %1 dword [A0], cl
1932 %else
1933 xchg A1, A0
1934 %1 dword [A1], cl
1935 %endif
1936 IEM_SAVE_FLAGS A2, %2, %3
1937 EPILOGUE_3_ARGS
1938ENDPROC iemAImpl_ %+ %1 %+ _u32
1939
1940 %ifdef RT_ARCH_AMD64
1941BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1942 PROLOGUE_3_ARGS
1943 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
1944 %ifdef ASM_CALL64_GCC
1945 mov cl, A1_8
1946 %1 qword [A0], cl
1947 %else
1948 xchg A1, A0
1949 %1 qword [A1], cl
1950 %endif
1951 IEM_SAVE_FLAGS A2, %2, %3
1952 EPILOGUE_3_ARGS
1953ENDPROC iemAImpl_ %+ %1 %+ _u64
1954 %endif ; RT_ARCH_AMD64
1955
1956%endmacro
1957
1958;; @todo some questions wrt flags when the shift count is high according to intel docs...
1959IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
1960IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
1961IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
1962IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
1963IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
1964IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
1965IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
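;
; Illustrative expansion (not assembled): roughly what the 'shl' instantiation
; above produces for the 32-bit variant when ASM_CALL64_GCC is defined.  With
; that GCC/SysV-style mapping rCX is the (here unused) A3 register, so the count
; can simply be copied into cl; the other conventions have A0 in rCX/eCX, hence
; the xchg of A0 and A1 in the macro's other branch.
;
%if 0
BEGINPROC_FASTCALL iemAImpl_shl_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
        mov     cl, A1_8
        shl     dword [A0], cl
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_shl_u32
%endif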
1966
1967
1968;;
1969; Macro for implementing a double precision shift operation.
1970;
1971; This will generate code for the 16, 32 and 64 bit accesses, except on
1972; 32-bit systems where the 64-bit accesses require hand coding.
1973;
1974; The functions take the destination operand (r/m) in A0, the source (reg) in
1975; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1976;
1977; @param 1 The instruction mnemonic.
1978; @param 2 The modified flags.
1979; @param 3 The undefined flags.
1980;
1981; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1982;
1983; @note the _intel and _amd variants are implemented in C.
1984;
1985%macro IEMIMPL_SHIFT_DBL_OP 3
1986BEGINCODE
1987BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1988 PROLOGUE_4_ARGS
1989 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1990 %ifdef ASM_CALL64_GCC
1991 xchg A3, A2
1992 %1 [A0], A1_16, cl
1993 xchg A3, A2
1994 %else
1995 xchg A0, A2
1996 %1 [A2], A1_16, cl
1997 %endif
1998 IEM_SAVE_FLAGS A3, %2, %3
1999 EPILOGUE_4_ARGS
2000ENDPROC iemAImpl_ %+ %1 %+ _u16
2001
2002BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
2003 PROLOGUE_4_ARGS
2004 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2005 %ifdef ASM_CALL64_GCC
2006 xchg A3, A2
2007 %1 [A0], A1_32, cl
2008 xchg A3, A2
2009 %else
2010 xchg A0, A2
2011 %1 [A2], A1_32, cl
2012 %endif
2013 IEM_SAVE_FLAGS A3, %2, %3
2014 EPILOGUE_4_ARGS
2015ENDPROC iemAImpl_ %+ %1 %+ _u32
2016
2017 %ifdef RT_ARCH_AMD64
2018BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
2019 PROLOGUE_4_ARGS
2020 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2021 %ifdef ASM_CALL64_GCC
2022 xchg A3, A2
2023 %1 [A0], A1, cl
2024 xchg A3, A2
2025 %else
2026 xchg A0, A2
2027 %1 [A2], A1, cl
2028 %endif
2029 IEM_SAVE_FLAGS A3, %2, %3
2030 EPILOGUE_4_ARGS_EX 12
2031ENDPROC iemAImpl_ %+ %1 %+ _u64
2032 %endif ; RT_ARCH_AMD64
2033
2034%endmacro
2035
2036IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2037IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
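;
; Note on the xchg dance in the macro above: the shift count has to end up in
; cl.  With the GCC/SysV-style register assignment A3 is rCX, so the count in
; A2 is swapped into it and swapped back afterwards (A3 still holds the eflags
; pointer needed by IEM_SAVE_FLAGS).  With the Microsoft/x86 fastcall
; assignments A0 is rCX/eCX, so the destination pointer and the count simply
; trade places instead.
;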
2038
2039
2040;;
2041; Macro for implementing a multiplication operation.
2042;
2043; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2044; 32-bit systems where the 64-bit accesses require hand coding.
2045;
2046; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2047; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2048; pointer to eflags in A3.
2049;
2050; The functions all return 0 so the same caller code can be used for div/idiv
2051; as well as for the mul/imul implementation.
2052;
2053; @param 1 The instruction mnemonic.
2054; @param 2 The modified flags.
2055; @param 3 The undefined flags.
2056; @param 4 Name suffix.
2057; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2058;
2059; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2060;
2061%macro IEMIMPL_MUL_OP 5
2062BEGINCODE
2063BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
2064 PROLOGUE_3_ARGS
2065 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2066 mov al, [A0]
2067 %1 A1_8
2068 mov [A0], ax
2069 %if %5 != 1
2070 IEM_SAVE_FLAGS A2, %2, %3
2071 %else
2072 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
2073 %endif
2074 xor eax, eax
2075 EPILOGUE_3_ARGS
2076ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
2077
2078BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
2079 PROLOGUE_4_ARGS
2080 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2081 mov ax, [A0]
2082 %ifdef ASM_CALL64_GCC
2083 %1 A2_16
2084 mov [A0], ax
2085 mov [A1], dx
2086 %else
2087 mov T1, A1
2088 %1 A2_16
2089 mov [A0], ax
2090 mov [T1], dx
2091 %endif
2092 %if %5 != 1
2093 IEM_SAVE_FLAGS A3, %2, %3
2094 %else
2095 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
2096 %endif
2097 xor eax, eax
2098 EPILOGUE_4_ARGS
2099ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
2100
2101BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
2102 PROLOGUE_4_ARGS
2103 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2104 mov eax, [A0]
2105 %ifdef ASM_CALL64_GCC
2106 %1 A2_32
2107 mov [A0], eax
2108 mov [A1], edx
2109 %else
2110 mov T1, A1
2111 %1 A2_32
2112 mov [A0], eax
2113 mov [T1], edx
2114 %endif
2115 %if %5 != 1
2116 IEM_SAVE_FLAGS A3, %2, %3
2117 %else
2118 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
2119 %endif
2120 xor eax, eax
2121 EPILOGUE_4_ARGS
2122ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
2123
2124 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2125BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2126 PROLOGUE_4_ARGS
2127 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2128 mov rax, [A0]
2129 %ifdef ASM_CALL64_GCC
2130 %1 A2
2131 mov [A0], rax
2132 mov [A1], rdx
2133 %else
2134 mov T1, A1
2135 %1 A2
2136 mov [A0], rax
2137 mov [T1], rdx
2138 %endif
2139 %if %5 != 1
2140 IEM_SAVE_FLAGS A3, %2, %3
2141 %else
2142 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
2143 %endif
2144 xor eax, eax
2145 EPILOGUE_4_ARGS_EX 12
2146ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2147 %endif ; RT_ARCH_AMD64
2148
2149%endmacro
2150
2151IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2152IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2153IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2154IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2155IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2156IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
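;
; The instantiations above generate iemAImpl_mul_u8/u16/u32/u64 and the
; corresponding imul functions, each in three EFLAGS flavours: the plain ones
; save the host flags with SF/ZF/AF/PF treated as undefined, the _intel ones
; (%5 == 1) derive SF/PF from the result and adjust AF/ZF via
; IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF, and the _amd ones (%5 == 2) just save
; the host OF/CF.
;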
2157
2158
2159BEGINCODE
2160;;
2161; Worker function for negating the 64-bit value held in T1:T0 (two 32-bit halves)
2162; @uses None (T0,T1)
2163BEGINPROC iemAImpl_negate_T0_T1_u32
2164 push 0
2165 push 0
2166 xchg T0_32, [xSP]
2167 xchg T1_32, [xSP + xCB]
2168 sub T0_32, [xSP]
2169 sbb T1_32, [xSP + xCB]
2170 add xSP, xCB*2
2171 ret
2172ENDPROC iemAImpl_negate_T0_T1_u32
2173
2174%ifdef RT_ARCH_AMD64
2175;;
2176; Worker function for negating the 128-bit value held in T1:T0 (two 64-bit halves)
2177; @uses None (T0,T1)
2178BEGINPROC iemAImpl_negate_T0_T1_u64
2179 push 0
2180 push 0
2181 xchg T0, [xSP]
2182 xchg T1, [xSP + xCB]
2183 sub T0, [xSP]
2184 sbb T1, [xSP + xCB]
2185 add xSP, xCB*2
2186 ret
2187ENDPROC iemAImpl_negate_T0_T1_u64
2188%endif
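;
; Example: negating T1:T0 = 0x00000000:0x00000001 with the sequence above
; yields 0xFFFFFFFF:0xFFFFFFFF, i.e. the register pair is treated as a single
; wide two's complement value (0 - value, with the borrow propagated by sbb).
;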
2189
2190
2191;;
2192; Macro for implementing a division operation.
2193;
2194; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2195; 32-bit systems where the 64-bit accesses require hand coding.
2196;
2197; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2198; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2199; pointer to eflags in A3.
2200;
2201; The functions all return 0 on success and -1 if a divide error should be
2202; raised by the caller.
2203;
2204; @param 1 The instruction mnemonic.
2205; @param 2 The modified flags.
2206; @param 3 The undefined flags.
2207; @param 4 1 if signed, 0 if unsigned.
2208; @param 5 Function suffix.
2209; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2210; 2 for AMD (set AF, clear PF, ZF and SF).
2211;
2212; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2213;
2214%macro IEMIMPL_DIV_OP 6
2215BEGINCODE
2216BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2217 PROLOGUE_3_ARGS
2218
2219 ; div by chainsaw check.
2220 and A1_32, 0xff ; Ensure it's zero extended to 16-bits for the idiv range check.
2221 jz .div_zero
2222
2223 ; Overflow check - unsigned division is simple to verify, but we haven't
2224 ; found a simple way to check signed division yet, unfortunately.
2225 %if %4 == 0
2226 cmp [A0 + 1], A1_8
2227 jae .div_overflow
2228 %else
2229 movzx T0_32, word [A0] ; T0 = dividend (zero extending to full register to simplify register aliasing)
2230 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2231 test A1_8, A1_8
2232 js .divisor_negative
2233 test T0_16, T0_16
2234 jns .both_positive
2235 neg T0_16
2236.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2237 push T0 ; Start off like unsigned below.
2238 shr T0_16, 7
2239 cmp T0_16, A1_16 ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
2240 pop T0
2241 jb .div_no_overflow
2242 ja .div_overflow
2243 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2244 cmp T0_8, A1_8
2245 jae .div_overflow
2246 jmp .div_no_overflow
2247
2248.divisor_negative:
2249 neg A1_8
2250 test T0_16, T0_16
2251 jns .one_of_each
2252 neg T0_16
2253.both_positive: ; Same as unsigned shifted by sign indicator bit.
2254 shr T0_16, 7
2255 cmp T0_16, A1_16 ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
2256 jae .div_overflow
2257.div_no_overflow:
2258 mov A1, T1 ; restore divisor
2259 %endif
2260
2261 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2262 mov ax, [A0]
2263 %1 A1_8
2264 mov [A0], ax
2265 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2266 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2267 %else
2268 IEM_SAVE_FLAGS A2, %2, %3
2269 %endif
2270 xor eax, eax
2271
2272.return:
2273 EPILOGUE_3_ARGS
2274
2275.div_zero:
2276.div_overflow:
2277 mov eax, -1
2278 jmp .return
2279ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2280
2281BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2282 PROLOGUE_4_ARGS
2283
2284 ; div by chainsaw check.
2285 and A2_32, 0xffff ; Zero extend it for simpler sign overflow checks (see below).
2286 jz .div_zero
2287
2288 ; Overflow check - unsigned division is simple to verify, but we haven't
2289 ; found a simple way to check signed division yet, unfortunately.
2290 %if %4 == 0
2291 cmp [A1], A2_16
2292 jae .div_overflow
2293 %else
2294 movzx T0_32, word [A1] ; Zero extend to simplify register aliasing by clobbering the whole register.
2295 shl T0_32, 16
2296 mov T0_16, [A0] ; T0 = dividend
2297 mov T1, A2 ; T1 = divisor
2298 test T1_16, T1_16
2299 js .divisor_negative
2300 test T0_32, T0_32
2301 jns .both_positive
2302 neg T0_32
2303.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2304 push T0 ; Start off like unsigned below.
2305 shr T0_32, 15
2306 cmp T0_32, T1_32 ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
2307 pop T0
2308 jb .div_no_overflow
2309 ja .div_overflow
2310 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2311 cmp T0_16, T1_16
2312 jae .div_overflow
2313 jmp .div_no_overflow
2314
2315.divisor_negative:
2316 neg T1_16
2317 test T0_32, T0_32
2318 jns .one_of_each
2319 neg T0_32
2320.both_positive: ; Same as unsigned shifted by sign indicator bit.
2321 shr T0_32, 15
2322 cmp T0_32, T1_32 ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
2323 jae .div_overflow
2324.div_no_overflow:
2325 %endif
2326
2327 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2328 %ifdef ASM_CALL64_GCC
2329 mov T1, A2
2330 mov ax, [A0]
2331 mov dx, [A1]
2332 %1 T1_16
2333 mov [A0], ax
2334 mov [A1], dx
2335 %else
2336 mov T1, A1
2337 mov ax, [A0]
2338 mov dx, [T1]
2339 %1 A2_16
2340 mov [A0], ax
2341 mov [T1], dx
2342 %endif
2343 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2344 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2345 %else
2346 IEM_SAVE_FLAGS A3, %2, %3
2347 %endif
2348 xor eax, eax
2349
2350.return:
2351 EPILOGUE_4_ARGS
2352
2353.div_zero:
2354.div_overflow:
2355 mov eax, -1
2356 jmp .return
2357ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2358
2359BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2360 PROLOGUE_4_ARGS
2361
2362 ; div by chainsaw check.
2363 test A2_32, A2_32
2364 jz .div_zero
2365
2366 ; Overflow check - unsigned division is simple to verify, but we haven't
2367 ; found a simple way to check signed division yet, unfortunately.
2368 %if %4 == 0
2369 cmp [A1], A2_32
2370 jae .div_overflow
2371 %else
2372 push A2 ; save A2 so we can modify it (we're out of regs on x86).
2373 mov T0_32, [A0] ; T0 = dividend low
2374 mov T1_32, [A1] ; T1 = dividend high
2375 ;test A2_32, A2_32 - we did this 5 instructions ago.
2376 js .divisor_negative
2377 test T1_32, T1_32
2378 jns .both_positive
2379 call NAME(iemAImpl_negate_T0_T1_u32)
2380.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2381 test T1_32, 0x80000000 ; neg 0x8000000000000000 = 0x8000000000000000
2382 jnz .div_overflow
2383 push T0 ; Start off like unsigned below.
2384 shl T1_32, 1
2385 shr T0_32, 31
2386 or T1_32, T0_32
2387 cmp T1_32, A2_32
2388 pop T0
2389 jb .div_no_overflow
2390 ja .div_overflow
2391 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2392 cmp T0_32, A2_32
2393 jae .div_overflow
2394 jmp .div_no_overflow
2395
2396.divisor_negative:
2397 neg A2_32
2398 test T1_32, T1_32
2399 jns .one_of_each
2400 call NAME(iemAImpl_negate_T0_T1_u32)
2401.both_positive: ; Same as unsigned shifted by sign indicator bit.
2402 test T1_32, 0x80000000 ; neg 0x8000000000000000 = 0x8000000000000000
2403 jnz .div_overflow
2404 shl T1_32, 1
2405 shr T0_32, 31
2406 or T1_32, T0_32
2407 cmp T1_32, A2_32
2408 jae .div_overflow
2409.div_no_overflow:
2410 pop A2
2411 %endif
2412
2413 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2414 mov eax, [A0]
2415 %ifdef ASM_CALL64_GCC
2416 mov T1, A2
2417 mov eax, [A0]
2418 mov edx, [A1]
2419 %1 T1_32
2420 mov [A0], eax
2421 mov [A1], edx
2422 %else
2423 mov T1, A1
2424 mov eax, [A0]
2425 mov edx, [T1]
2426 %1 A2_32
2427 mov [A0], eax
2428 mov [T1], edx
2429 %endif
2430 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2431 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2432 %else
2433 IEM_SAVE_FLAGS A3, %2, %3
2434 %endif
2435 xor eax, eax
2436
2437.return:
2438 EPILOGUE_4_ARGS
2439
2440.div_overflow:
2441 %if %4 != 0
2442 pop A2
2443 %endif
2444.div_zero:
2445 mov eax, -1
2446 jmp .return
2447ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2448
2449 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2450BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2451 PROLOGUE_4_ARGS
2452
2453 test A2, A2
2454 jz .div_zero
2455 %if %4 == 0
2456 cmp [A1], A2
2457 jae .div_overflow
2458 %else
2459 push A2 ; save A2 so we can modify it (we're out of regs on x86).
2460 mov T0, [A0] ; T0 = dividend low
2461 mov T1, [A1] ; T1 = dividend high
2462 ;test A2, A2 - we did this five instructions above.
2463 js .divisor_negative
2464 test T1, T1
2465 jns .both_positive
2466 call NAME(iemAImpl_negate_T0_T1_u64)
2467.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2468 bt T1, 63 ; neg 0x8000000000000000'0000000000000000 = same
2469 jc .div_overflow
2470 push T0 ; Start off like unsigned below.
2471 shl T1, 1
2472 shr T0, 63
2473 or T1, T0
2474 cmp T1, A2
2475 pop T0
2476 jb .div_no_overflow
2477 ja .div_overflow
2478 mov T1, 0x7fffffffffffffff
2479 and T0, T1 ; Special case for covering (divisor - 1).
2480 cmp T0, A2
2481 jae .div_overflow
2482 jmp .div_no_overflow
2483
2484.divisor_negative:
2485 neg A2
2486 test T1, T1
2487 jns .one_of_each
2488 call NAME(iemAImpl_negate_T0_T1_u64)
2489.both_positive: ; Same as unsigned shifted by sign indicator bit.
2490 bt T1, 63 ; neg 0x8000000000000000'0000000000000000 = same
2491 jc .div_overflow
2492 shl T1, 1
2493 shr T0, 63
2494 or T1, T0
2495 cmp T1, A2
2496 jae .div_overflow
2497.div_no_overflow:
2498 pop A2
2499 %endif
2500
2501 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2502 mov rax, [A0]
2503 %ifdef ASM_CALL64_GCC
2504 mov T1, A2
2505 mov rax, [A0]
2506 mov rdx, [A1]
2507 %1 T1
2508 mov [A0], rax
2509 mov [A1], rdx
2510 %else
2511 mov T1, A1
2512 mov rax, [A0]
2513 mov rdx, [T1]
2514 %1 A2
2515 mov [A0], rax
2516 mov [T1], rdx
2517 %endif
2518 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2519 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2520 %else
2521 IEM_SAVE_FLAGS A3, %2, %3
2522 %endif
2523 xor eax, eax
2524
2525.return:
2526 EPILOGUE_4_ARGS_EX 12
2527
2528.div_overflow:
2529 %if %4 != 0
2530 pop A2
2531 %endif
2532.div_zero:
2533 mov eax, -1
2534 jmp .return
2535ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2536 %endif ; RT_ARCH_AMD64
2537
2538%endmacro
2539
2540IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
2541IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
2542IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
2543;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
2544IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
2545IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
2546IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
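;
; Worked examples for the overflow checks above (8-bit forms):
;   - unsigned:  AX=0x1234, DL=0x12: the quotient 258 does not fit in AL; the
;     'cmp [A0 + 1], A1_8' check catches it since AH (0x12) >= DL (0x12).
;   - signed, differing signs:  AX=-257, DL=2 gives quotient -128, remainder -1
;     and is fine, while AX=-258 would give -129 and must raise #DE - matching
;     the 2^7 * divisor + (divisor - 1) = 257 limit checked at .one_of_each.
;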
2547
2548
2549;;
2550; Macro for implementing a memory fence operation.
2551;
2552; No return value, no operands or anything.
2553;
2554; @param 1 The instruction.
2555;
2556%macro IEMIMPL_MEM_FENCE 1
2557BEGINCODE
2558BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
2559 %1
2560 ret
2561ENDPROC iemAImpl_ %+ %1
2562%endmacro
2563
2564IEMIMPL_MEM_FENCE lfence
2565IEMIMPL_MEM_FENCE sfence
2566IEMIMPL_MEM_FENCE mfence
2567
2568;;
2569; Alternative memory fence for non-SSE2 hosts.
2570;
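; The xchg with a stack slot is implicitly LOCKed by the CPU, so it serializes
; prior loads and stores much like mfence does, without requiring SSE2.
;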
2571BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
2572 push xAX
2573 xchg xAX, [xSP]
2574 add xSP, xCB
2575 ret
2576ENDPROC iemAImpl_alt_mem_fence
2577
2578
2579;;
2580; Initialize the FPU for the actual instruction being emulated; this means
2581; loading parts of the guest's control word and status word.
2582;
2583; @uses 24 bytes of stack. T0, T1
2584; @param 1 Expression giving the address of the FXSTATE of the guest.
2585;
2586%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
2587 fnstenv [xSP]
2588
2589 ; FCW - for exception, precision and rounding control.
2590 movzx T0, word [%1 + X86FXSTATE.FCW]
2591 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2592 mov [xSP + X86FSTENV32P.FCW], T0_16
2593
2594 ; FSW - for undefined C0, C1, C2, and C3.
2595 movzx T1, word [%1 + X86FXSTATE.FSW]
2596 and T1, X86_FSW_C_MASK
2597 movzx T0, word [xSP + X86FSTENV32P.FSW]
2598 and T0, X86_FSW_TOP_MASK
2599 or T0, T1
2600 mov [xSP + X86FSTENV32P.FSW], T0_16
2601
2602 fldenv [xSP]
2603%endmacro
2604
2605
2606;;
2607; Initialize the FPU for the actual instruction being emulated; this means
2608; loading parts of the guest's control word and status word, and updating the
2609; tag word for the top register if it's empty.
2610;
2611; ASSUMES actual TOP=7
2612;
2613; @uses 24 bytes of stack. T0, T1
2614; @param 1 Expression giving the address of the FXSTATE of the guest.
2615;
2616%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
2617 fnstenv [xSP]
2618
2619 ; FCW - for exception, precision and rounding control.
2620 movzx T0_32, word [%1 + X86FXSTATE.FCW]
2621 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2622 mov [xSP + X86FSTENV32P.FCW], T0_16
2623
2624 ; FSW - for undefined C0, C1, C2, and C3.
2625 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2626 and T1_32, X86_FSW_C_MASK
2627 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
2628 and T0_32, X86_FSW_TOP_MASK
2629 or T0_32, T1_32
2630 mov [xSP + X86FSTENV32P.FSW], T0_16
2631
2632 ; FTW - Only for ST0 (in/out).
2633 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2634 shr T1_32, X86_FSW_TOP_SHIFT
2635 and T1_32, X86_FSW_TOP_SMASK
2636 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
2637 jc %%st0_not_empty
2638 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
2639%%st0_not_empty:
2640
2641 fldenv [xSP]
2642%endmacro
2643
2644
2645;;
2646; Need to move this as well somewhere better?
2647;
2648struc IEMFPURESULT
2649 .r80Result resw 5
2650 .FSW resw 1
2651endstruc
2652
2653
2654;;
2655; Need to move this as well somewhere better?
2656;
2657struc IEMFPURESULTTWO
2658 .r80Result1 resw 5
2659 .FSW resw 1
2660 .r80Result2 resw 5
2661endstruc
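
;; @note These are expected to mirror the corresponding C-side IEMFPURESULT and
;        IEMFPURESULTTWO types; each 80-bit result takes 10 bytes (resw 5) and
;        the FSW one word.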
2662
2663
2664;
2665;---------------------- 16-bit signed integer operations ----------------------
2666;
2667
2668
2669;;
2670; Converts a 16-bit signed integer to an 80-bit floating point value (fpu register).
2671;
2672; @param A0 FPU context (fxsave).
2673; @param A1 Pointer to a IEMFPURESULT for the output.
2674; @param A2 Pointer to the 16-bit signed integer value to convert.
2675;
2676BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
2677 PROLOGUE_3_ARGS
2678 sub xSP, 20h
2679
2680 fninit
2681 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2682 fild word [A2]
2683
2684 fnstsw word [A1 + IEMFPURESULT.FSW]
2685 fnclex
2686 fstp tword [A1 + IEMFPURESULT.r80Result]
2687
2688 fninit
2689 add xSP, 20h
2690 EPILOGUE_3_ARGS
2691ENDPROC iemAImpl_fild_r80_from_i16
2692
2693
2694;;
2695; Store an 80-bit floating point value (register) as a 16-bit signed integer (memory).
2696;
2697; @param A0 FPU context (fxsave).
2698; @param A1 Where to return the output FSW.
2699; @param A2 Where to store the 16-bit signed integer value.
2700; @param A3 Pointer to the 80-bit value.
2701;
2702BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
2703 PROLOGUE_4_ARGS
2704 sub xSP, 20h
2705
2706 fninit
2707 fld tword [A3]
2708 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2709 fistp word [A2]
2710
2711 fnstsw word [A1]
2712
2713 fninit
2714 add xSP, 20h
2715 EPILOGUE_4_ARGS
2716ENDPROC iemAImpl_fist_r80_to_i16
2717
2718
2719;;
2720; Store an 80-bit floating point value (register) as a 16-bit signed integer
2721; (memory) with truncation.
2722;
2723; @param A0 FPU context (fxsave).
2724; @param A1 Where to return the output FSW.
2725; @param A2 Where to store the 16-bit signed integer value.
2726; @param A3 Pointer to the 80-bit value.
2727;
2728BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
2729 PROLOGUE_4_ARGS
2730 sub xSP, 20h
2731
2732 fninit
2733 fld tword [A3]
2734 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2735 fisttp word [A2]
2736
2737 fnstsw word [A1]
2738
2739 fninit
2740 add xSP, 20h
2741 EPILOGUE_4_ARGS
2742ENDPROC iemAImpl_fistt_r80_to_i16
2743
2744
2745;;
2746; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2747;
2748; @param 1 The instruction
2749;
2750; @param A0 FPU context (fxsave).
2751; @param A1 Pointer to a IEMFPURESULT for the output.
2752; @param A2 Pointer to the 80-bit value.
2753; @param A3 Pointer to the 16-bit value.
2754;
2755%macro IEMIMPL_FPU_R80_BY_I16 1
2756BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2757 PROLOGUE_4_ARGS
2758 sub xSP, 20h
2759
2760 fninit
2761 fld tword [A2]
2762 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2763 %1 word [A3]
2764
2765 fnstsw word [A1 + IEMFPURESULT.FSW]
2766 fnclex
2767 fstp tword [A1 + IEMFPURESULT.r80Result]
2768
2769 fninit
2770 add xSP, 20h
2771 EPILOGUE_4_ARGS
2772ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2773%endmacro
2774
2775IEMIMPL_FPU_R80_BY_I16 fiadd
2776IEMIMPL_FPU_R80_BY_I16 fimul
2777IEMIMPL_FPU_R80_BY_I16 fisub
2778IEMIMPL_FPU_R80_BY_I16 fisubr
2779IEMIMPL_FPU_R80_BY_I16 fidiv
2780IEMIMPL_FPU_R80_BY_I16 fidivr
2781
2782
2783;;
2784; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2785; only returning FSW.
2786;
2787; @param 1 The instruction
2788;
2789; @param A0 FPU context (fxsave).
2790; @param A1 Where to store the output FSW.
2791; @param A2 Pointer to the 80-bit value.
2792; @param A3 Pointer to the 16-bit value.
2793;
2794%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
2795BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2796 PROLOGUE_4_ARGS
2797 sub xSP, 20h
2798
2799 fninit
2800 fld tword [A2]
2801 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2802 %1 word [A3]
2803
2804 fnstsw word [A1]
2805
2806 fninit
2807 add xSP, 20h
2808 EPILOGUE_4_ARGS
2809ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2810%endmacro
2811
2812IEMIMPL_FPU_R80_BY_I16_FSW ficom
2813
2814
2815
2816;
2817;---------------------- 32-bit signed integer operations ----------------------
2818;
2819
2820
2821;;
2822; Converts a 32-bit signed integer to an 80-bit floating point value (fpu register).
2823;
2824; @param A0 FPU context (fxsave).
2825; @param A1 Pointer to a IEMFPURESULT for the output.
2826; @param A2 Pointer to the 32-bit signed integer value to convert.
2827;
2828BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
2829 PROLOGUE_3_ARGS
2830 sub xSP, 20h
2831
2832 fninit
2833 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2834 fild dword [A2]
2835
2836 fnstsw word [A1 + IEMFPURESULT.FSW]
2837 fnclex
2838 fstp tword [A1 + IEMFPURESULT.r80Result]
2839
2840 fninit
2841 add xSP, 20h
2842 EPILOGUE_3_ARGS
2843ENDPROC iemAImpl_fild_r80_from_i32
2844
2845
2846;;
2847; Store an 80-bit floating point value (register) as a 32-bit signed integer (memory).
2848;
2849; @param A0 FPU context (fxsave).
2850; @param A1 Where to return the output FSW.
2851; @param A2 Where to store the 32-bit signed integer value.
2852; @param A3 Pointer to the 80-bit value.
2853;
2854BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2855 PROLOGUE_4_ARGS
2856 sub xSP, 20h
2857
2858 fninit
2859 fld tword [A3]
2860 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2861 fistp dword [A2]
2862
2863 fnstsw word [A1]
2864
2865 fninit
2866 add xSP, 20h
2867 EPILOGUE_4_ARGS
2868ENDPROC iemAImpl_fist_r80_to_i32
2869
2870
2871;;
2872; Store an 80-bit floating point value (register) as a 32-bit signed integer
2873; (memory) with truncation.
2874;
2875; @param A0 FPU context (fxsave).
2876; @param A1 Where to return the output FSW.
2877; @param A2 Where to store the 32-bit signed integer value.
2878; @param A3 Pointer to the 80-bit value.
2879;
2880BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2881 PROLOGUE_4_ARGS
2882 sub xSP, 20h
2883
2884 fninit
2885 fld tword [A3]
2886 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2887 fisttp dword [A2]
2888
2889 fnstsw word [A1]
2890
2891 fninit
2892 add xSP, 20h
2893 EPILOGUE_4_ARGS
2894ENDPROC iemAImpl_fistt_r80_to_i32
2895
2896
2897;;
2898; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2899;
2900; @param 1 The instruction
2901;
2902; @param A0 FPU context (fxsave).
2903; @param A1 Pointer to a IEMFPURESULT for the output.
2904; @param A2 Pointer to the 80-bit value.
2905; @param A3 Pointer to the 32-bit value.
2906;
2907%macro IEMIMPL_FPU_R80_BY_I32 1
2908BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2909 PROLOGUE_4_ARGS
2910 sub xSP, 20h
2911
2912 fninit
2913 fld tword [A2]
2914 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2915 %1 dword [A3]
2916
2917 fnstsw word [A1 + IEMFPURESULT.FSW]
2918 fnclex
2919 fstp tword [A1 + IEMFPURESULT.r80Result]
2920
2921 fninit
2922 add xSP, 20h
2923 EPILOGUE_4_ARGS
2924ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2925%endmacro
2926
2927IEMIMPL_FPU_R80_BY_I32 fiadd
2928IEMIMPL_FPU_R80_BY_I32 fimul
2929IEMIMPL_FPU_R80_BY_I32 fisub
2930IEMIMPL_FPU_R80_BY_I32 fisubr
2931IEMIMPL_FPU_R80_BY_I32 fidiv
2932IEMIMPL_FPU_R80_BY_I32 fidivr
2933
2934
2935;;
2936; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2937; only returning FSW.
2938;
2939; @param 1 The instruction
2940;
2941; @param A0 FPU context (fxsave).
2942; @param A1 Where to store the output FSW.
2943; @param A2 Pointer to the 80-bit value.
2944; @param A3 Pointer to the 32-bit value.
2945;
2946%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2947BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2948 PROLOGUE_4_ARGS
2949 sub xSP, 20h
2950
2951 fninit
2952 fld tword [A2]
2953 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2954 %1 dword [A3]
2955
2956 fnstsw word [A1]
2957
2958 fninit
2959 add xSP, 20h
2960 EPILOGUE_4_ARGS
2961ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2962%endmacro
2963
2964IEMIMPL_FPU_R80_BY_I32_FSW ficom
2965
2966
2967
2968;
2969;---------------------- 64-bit signed integer operations ----------------------
2970;
2971
2972
2973;;
2974; Converts a 64-bit signed integer to an 80-bit floating point value (fpu register).
2975;
2976; @param A0 FPU context (fxsave).
2977; @param A1 Pointer to a IEMFPURESULT for the output.
2978; @param A2 Pointer to the 64-bit signed integer value to convert.
2979;
2980BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
2981 PROLOGUE_3_ARGS
2982 sub xSP, 20h
2983
2984 fninit
2985 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2986 fild qword [A2]
2987
2988 fnstsw word [A1 + IEMFPURESULT.FSW]
2989 fnclex
2990 fstp tword [A1 + IEMFPURESULT.r80Result]
2991
2992 fninit
2993 add xSP, 20h
2994 EPILOGUE_3_ARGS
2995ENDPROC iemAImpl_fild_r80_from_i64
2996
2997
2998;;
2999; Store an 80-bit floating point value (register) as a 64-bit signed integer (memory).
3000;
3001; @param A0 FPU context (fxsave).
3002; @param A1 Where to return the output FSW.
3003; @param A2 Where to store the 64-bit signed integer value.
3004; @param A3 Pointer to the 80-bit value.
3005;
3006BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
3007 PROLOGUE_4_ARGS
3008 sub xSP, 20h
3009
3010 fninit
3011 fld tword [A3]
3012 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3013 fistp qword [A2]
3014
3015 fnstsw word [A1]
3016
3017 fninit
3018 add xSP, 20h
3019 EPILOGUE_4_ARGS
3020ENDPROC iemAImpl_fist_r80_to_i64
3021
3022
3023;;
3024; Store an 80-bit floating point value (register) as a 64-bit signed integer
3025; (memory) with truncation.
3026;
3027; @param A0 FPU context (fxsave).
3028; @param A1 Where to return the output FSW.
3029; @param A2 Where to store the 64-bit signed integer value.
3030; @param A3 Pointer to the 80-bit value.
3031;
3032BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
3033 PROLOGUE_4_ARGS
3034 sub xSP, 20h
3035
3036 fninit
3037 fld tword [A3]
3038 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3039 fisttp qword [A2]
3040
3041 fnstsw word [A1]
3042
3043 fninit
3044 add xSP, 20h
3045 EPILOGUE_4_ARGS
3046ENDPROC iemAImpl_fistt_r80_to_i64
3047
3048
3049
3050;
3051;---------------------- 32-bit floating point operations ----------------------
3052;
3053
3054;;
3055; Converts a 32-bit floating point value to an 80-bit one (fpu register).
3056;
3057; @param A0 FPU context (fxsave).
3058; @param A1 Pointer to a IEMFPURESULT for the output.
3059; @param A2 Pointer to the 32-bit floating point value to convert.
3060;
3061BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
3062 PROLOGUE_3_ARGS
3063 sub xSP, 20h
3064
3065 fninit
3066 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3067 fld dword [A2]
3068
3069 fnstsw word [A1 + IEMFPURESULT.FSW]
3070 fnclex
3071 fstp tword [A1 + IEMFPURESULT.r80Result]
3072
3073 fninit
3074 add xSP, 20h
3075 EPILOGUE_3_ARGS
3076ENDPROC iemAImpl_fld_r80_from_r32
3077
3078
3079;;
3080; Store an 80-bit floating point value (register) as a 32-bit one (memory).
3081;
3082; @param A0 FPU context (fxsave).
3083; @param A1 Where to return the output FSW.
3084; @param A2 Where to store the 32-bit value.
3085; @param A3 Pointer to the 80-bit value.
3086;
3087BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
3088 PROLOGUE_4_ARGS
3089 sub xSP, 20h
3090
3091 fninit
3092 fld tword [A3]
3093 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3094 fst dword [A2]
3095
3096 fnstsw word [A1]
3097
3098 fninit
3099 add xSP, 20h
3100 EPILOGUE_4_ARGS
3101ENDPROC iemAImpl_fst_r80_to_r32
3102
3103
3104;;
3105; FPU instruction working on one 80-bit and one 32-bit floating point value.
3106;
3107; @param 1 The instruction
3108;
3109; @param A0 FPU context (fxsave).
3110; @param A1 Pointer to a IEMFPURESULT for the output.
3111; @param A2 Pointer to the 80-bit value.
3112; @param A3 Pointer to the 32-bit value.
3113;
3114%macro IEMIMPL_FPU_R80_BY_R32 1
3115BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3116 PROLOGUE_4_ARGS
3117 sub xSP, 20h
3118
3119 fninit
3120 fld tword [A2]
3121 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3122 %1 dword [A3]
3123
3124 fnstsw word [A1 + IEMFPURESULT.FSW]
3125 fnclex
3126 fstp tword [A1 + IEMFPURESULT.r80Result]
3127
3128 fninit
3129 add xSP, 20h
3130 EPILOGUE_4_ARGS
3131ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3132%endmacro
3133
3134IEMIMPL_FPU_R80_BY_R32 fadd
3135IEMIMPL_FPU_R80_BY_R32 fmul
3136IEMIMPL_FPU_R80_BY_R32 fsub
3137IEMIMPL_FPU_R80_BY_R32 fsubr
3138IEMIMPL_FPU_R80_BY_R32 fdiv
3139IEMIMPL_FPU_R80_BY_R32 fdivr
3140
3141
3142;;
3143; FPU instruction working on one 80-bit and one 32-bit floating point value,
3144; only returning FSW.
3145;
3146; @param 1 The instruction
3147;
3148; @param A0 FPU context (fxsave).
3149; @param A1 Where to store the output FSW.
3150; @param A2 Pointer to the 80-bit value.
3151; @param A3 Pointer to the 32-bit value.
3152;
3153%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3154BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3155 PROLOGUE_4_ARGS
3156 sub xSP, 20h
3157
3158 fninit
3159 fld tword [A2]
3160 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3161 %1 dword [A3]
3162
3163 fnstsw word [A1]
3164
3165 fninit
3166 add xSP, 20h
3167 EPILOGUE_4_ARGS
3168ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3169%endmacro
3170
3171IEMIMPL_FPU_R80_BY_R32_FSW fcom
3172
3173
3174
3175;
3176;---------------------- 64-bit floating point operations ----------------------
3177;
3178
3179;;
3180; Converts a 64-bit floating point value to an 80-bit one (fpu register).
3181;
3182; @param A0 FPU context (fxsave).
3183; @param A1 Pointer to a IEMFPURESULT for the output.
3184; @param A2 Pointer to the 64-bit floating point value to convert.
3185;
3186BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3187 PROLOGUE_3_ARGS
3188 sub xSP, 20h
3189
3190 fninit
3191 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3192 fld qword [A2]
3193
3194 fnstsw word [A1 + IEMFPURESULT.FSW]
3195 fnclex
3196 fstp tword [A1 + IEMFPURESULT.r80Result]
3197
3198 fninit
3199 add xSP, 20h
3200 EPILOGUE_3_ARGS
3201ENDPROC iemAImpl_fld_r80_from_r64
3202
3203
3204;;
3205; Store an 80-bit floating point value (register) as a 64-bit one (memory).
3206;
3207; @param A0 FPU context (fxsave).
3208; @param A1 Where to return the output FSW.
3209; @param A2 Where to store the 64-bit value.
3210; @param A3 Pointer to the 80-bit value.
3211;
3212BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3213 PROLOGUE_4_ARGS
3214 sub xSP, 20h
3215
3216 fninit
3217 fld tword [A3]
3218 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3219 fst qword [A2]
3220
3221 fnstsw word [A1]
3222
3223 fninit
3224 add xSP, 20h
3225 EPILOGUE_4_ARGS
3226ENDPROC iemAImpl_fst_r80_to_r64
3227
3228
3229;;
3230; FPU instruction working on one 80-bit and one 64-bit floating point value.
3231;
3232; @param 1 The instruction
3233;
3234; @param A0 FPU context (fxsave).
3235; @param A1 Pointer to a IEMFPURESULT for the output.
3236; @param A2 Pointer to the 80-bit value.
3237; @param A3 Pointer to the 64-bit value.
3238;
3239%macro IEMIMPL_FPU_R80_BY_R64 1
3240BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3241 PROLOGUE_4_ARGS
3242 sub xSP, 20h
3243
3244 fninit
3245 fld tword [A2]
3246 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3247 %1 qword [A3]
3248
3249 fnstsw word [A1 + IEMFPURESULT.FSW]
3250 fnclex
3251 fstp tword [A1 + IEMFPURESULT.r80Result]
3252
3253 fninit
3254 add xSP, 20h
3255 EPILOGUE_4_ARGS
3256ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3257%endmacro
3258
3259IEMIMPL_FPU_R80_BY_R64 fadd
3260IEMIMPL_FPU_R80_BY_R64 fmul
3261IEMIMPL_FPU_R80_BY_R64 fsub
3262IEMIMPL_FPU_R80_BY_R64 fsubr
3263IEMIMPL_FPU_R80_BY_R64 fdiv
3264IEMIMPL_FPU_R80_BY_R64 fdivr
3265
3266;;
3267; FPU instruction working on one 80-bit and one 64-bit floating point value,
3268; only returning FSW.
3269;
3270; @param 1 The instruction
3271;
3272; @param A0 FPU context (fxsave).
3273; @param A1 Where to store the output FSW.
3274; @param A2 Pointer to the 80-bit value.
3275; @param A3 Pointer to the 64-bit value.
3276;
3277%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3278BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3279 PROLOGUE_4_ARGS
3280 sub xSP, 20h
3281
3282 fninit
3283 fld tword [A2]
3284 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3285 %1 qword [A3]
3286
3287 fnstsw word [A1]
3288
3289 fninit
3290 add xSP, 20h
3291 EPILOGUE_4_ARGS
3292ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3293%endmacro
3294
3295IEMIMPL_FPU_R80_BY_R64_FSW fcom
3296
3297
3298
3299;
3300;---------------------- 80-bit floating point operations ----------------------
3301;
3302
3303;;
3304; Loads an 80-bit floating point register value from memory.
3305;
3306; @param A0 FPU context (fxsave).
3307; @param A1 Pointer to a IEMFPURESULT for the output.
3308; @param A2 Pointer to the 80-bit floating point value to load.
3309;
3310BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3311 PROLOGUE_3_ARGS
3312 sub xSP, 20h
3313
3314 fninit
3315 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3316 fld tword [A2]
3317
3318 fnstsw word [A1 + IEMFPURESULT.FSW]
3319 fnclex
3320 fstp tword [A1 + IEMFPURESULT.r80Result]
3321
3322 fninit
3323 add xSP, 20h
3324 EPILOGUE_3_ARGS
3325ENDPROC iemAImpl_fld_r80_from_r80
3326
3327
3328;;
3329; Store an 80-bit floating point register to memory
3330;
3331; @param A0 FPU context (fxsave).
3332; @param A1 Where to return the output FSW.
3333; @param A2 Where to store the 80-bit value.
3334; @param A3 Pointer to the 80-bit register value.
3335;
3336BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3337 PROLOGUE_4_ARGS
3338 sub xSP, 20h
3339
3340 fninit
3341 fld tword [A3]
3342 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3343 fstp tword [A2]
3344
3345 fnstsw word [A1]
3346
3347 fninit
3348 add xSP, 20h
3349 EPILOGUE_4_ARGS
3350ENDPROC iemAImpl_fst_r80_to_r80
3351
3352
3353;;
3354; Loads an 80-bit floating point register value in BCD format from memory.
3355;
3356; @param A0 FPU context (fxsave).
3357; @param A1 Pointer to a IEMFPURESULT for the output.
3358; @param A2 Pointer to the 80-bit BCD value to load.
3359;
3360BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3361 PROLOGUE_3_ARGS
3362 sub xSP, 20h
3363
3364 fninit
3365 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3366 fbld tword [A2]
3367
3368 fnstsw word [A1 + IEMFPURESULT.FSW]
3369 fnclex
3370 fstp tword [A1 + IEMFPURESULT.r80Result]
3371
3372 fninit
3373 add xSP, 20h
3374 EPILOGUE_3_ARGS
3375ENDPROC iemAImpl_fld_r80_from_d80
3376
3377
3378;;
3379; Store an 80-bit floating point register to memory as BCD
3380;
3381; @param A0 FPU context (fxsave).
3382; @param A1 Where to return the output FSW.
3383; @param A2 Where to store the 80-bit BCD value.
3384; @param A3 Pointer to the 80-bit register value.
3385;
3386BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3387 PROLOGUE_4_ARGS
3388 sub xSP, 20h
3389
3390 fninit
3391 fld tword [A3]
3392 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3393 fbstp tword [A2]
3394
3395 fnstsw word [A1]
3396
3397 fninit
3398 add xSP, 20h
3399 EPILOGUE_4_ARGS
3400ENDPROC iemAImpl_fst_r80_to_d80
3401
3402
3403;;
3404; FPU instruction working on two 80-bit floating point values.
3405;
3406; @param 1 The instruction
3407;
3408; @param A0 FPU context (fxsave).
3409; @param A1 Pointer to a IEMFPURESULT for the output.
3410; @param A2 Pointer to the first 80-bit value (ST0)
3411; @param A3 Pointer to the second 80-bit value (STn).
3412;
3413%macro IEMIMPL_FPU_R80_BY_R80 2
3414BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3415 PROLOGUE_4_ARGS
3416 sub xSP, 20h
3417
3418 fninit
3419 fld tword [A3]
3420 fld tword [A2]
3421 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3422 %1 %2
3423
3424 fnstsw word [A1 + IEMFPURESULT.FSW]
3425 fnclex
3426 fstp tword [A1 + IEMFPURESULT.r80Result]
3427
3428 fninit
3429 add xSP, 20h
3430 EPILOGUE_4_ARGS
3431ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3432%endmacro
3433
3434IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3435IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3436IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3437IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3438IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3439IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3440IEMIMPL_FPU_R80_BY_R80 fprem, {}
3441IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3442IEMIMPL_FPU_R80_BY_R80 fscale, {}
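;
; Illustrative expansion (not assembled): IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
; produces roughly the following.  The braces supply the explicit operands for
; the instruction; instantiations such as fprem pass {} because those
; instructions implicitly operate on st0/st1.
;
%if 0
BEGINPROC_FASTCALL iemAImpl_fadd_r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fadd    st0, st1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fadd_r80_by_r80
%endif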
3443
3444
3445;;
3446; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3447; storing the result in ST1 and popping the stack.
3448;
3449; @param 1 The instruction
3450;
3451; @param A0 FPU context (fxsave).
3452; @param A1 Pointer to a IEMFPURESULT for the output.
3453; @param A2 Pointer to the first 80-bit value (ST1).
3454; @param A3 Pointer to the second 80-bit value (ST0).
3455;
3456%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3457BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3458 PROLOGUE_4_ARGS
3459 sub xSP, 20h
3460
3461 fninit
3462 fld tword [A2]
3463 fld tword [A3]
3464 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3465 %1
3466
3467 fnstsw word [A1 + IEMFPURESULT.FSW]
3468 fnclex
3469 fstp tword [A1 + IEMFPURESULT.r80Result]
3470
3471 fninit
3472 add xSP, 20h
3473 EPILOGUE_4_ARGS
3474ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3475%endmacro
3476
3477IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3478IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3479IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3480
3481
3482;;
3483; FPU instruction working on two 80-bit floating point values, only
3484; returning FSW.
3485;
3486; @param 1 The instruction
3487;
3488; @param A0 FPU context (fxsave).
3489; @param A1 Pointer to a uint16_t for the resulting FSW.
3490; @param A2 Pointer to the first 80-bit value.
3491; @param A3 Pointer to the second 80-bit value.
3492;
3493%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3494BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3495 PROLOGUE_4_ARGS
3496 sub xSP, 20h
3497
3498 fninit
3499 fld tword [A3]
3500 fld tword [A2]
3501 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3502 %1 st0, st1
3503
3504 fnstsw word [A1]
3505
3506 fninit
3507 add xSP, 20h
3508 EPILOGUE_4_ARGS
3509ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3510%endmacro
3511
3512IEMIMPL_FPU_R80_BY_R80_FSW fcom
3513IEMIMPL_FPU_R80_BY_R80_FSW fucom
3514
3515
3516;;
3517; FPU instruction working on two 80-bit floating point values,
3518; returning FSW and EFLAGS (eax).
3519;
3520; @param 1 The instruction
3521;
3522; @returns EFLAGS in EAX.
3523; @param A0 FPU context (fxsave).
3524; @param A1 Pointer to a uint16_t for the resulting FSW.
3525; @param A2 Pointer to the first 80-bit value.
3526; @param A3 Pointer to the second 80-bit value.
3527;
3528%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
3529BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3530 PROLOGUE_4_ARGS
3531 sub xSP, 20h
3532
3533 fninit
3534 fld tword [A3]
3535 fld tword [A2]
3536 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3537 %1 st1
3538
3539 fnstsw word [A1]
3540 pushf
3541 pop xAX
3542
3543 fninit
3544 add xSP, 20h
3545 EPILOGUE_4_ARGS
3546ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3547%endmacro
3548
3549IEMIMPL_FPU_R80_BY_R80_EFL fcomi
3550IEMIMPL_FPU_R80_BY_R80_EFL fucomi
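;
; fcomi/fucomi set ZF, PF and CF in the host EFLAGS according to the
; comparison; the pushf/pop xAX sequence above captures those, so the caller
; gets the comparison flags back in EAX alongside the FSW.
;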
3551
3552
3553;;
3554; FPU instruction working on one 80-bit floating point value.
3555;
3556; @param 1 The instruction
3557;
3558; @param A0 FPU context (fxsave).
3559; @param A1 Pointer to a IEMFPURESULT for the output.
3560; @param A2 Pointer to the 80-bit value.
3561;
3562%macro IEMIMPL_FPU_R80 1
3563BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3564 PROLOGUE_3_ARGS
3565 sub xSP, 20h
3566
3567 fninit
3568 fld tword [A2]
3569 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3570 %1
3571
3572 fnstsw word [A1 + IEMFPURESULT.FSW]
3573 fnclex
3574 fstp tword [A1 + IEMFPURESULT.r80Result]
3575
3576 fninit
3577 add xSP, 20h
3578 EPILOGUE_3_ARGS
3579ENDPROC iemAImpl_ %+ %1 %+ _r80
3580%endmacro
3581
3582IEMIMPL_FPU_R80 fchs
3583IEMIMPL_FPU_R80 fabs
3584IEMIMPL_FPU_R80 f2xm1
3585IEMIMPL_FPU_R80 fsqrt
3586IEMIMPL_FPU_R80 frndint
3587IEMIMPL_FPU_R80 fsin
3588IEMIMPL_FPU_R80 fcos
3589
3590
3591;;
3592; FPU instruction working on one 80-bit floating point value, only
3593; returning FSW.
3594;
3595; @param 1 The instruction
3596; @param 2 Non-zero to also restore FTW.
3597;
3598; @param A0 FPU context (fxsave).
3599; @param A1 Pointer to a uint16_t for the resulting FSW.
3600; @param A2 Pointer to the 80-bit value.
3601;
3602%macro IEMIMPL_FPU_R80_FSW 2
3603BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3604 PROLOGUE_3_ARGS
3605 sub xSP, 20h
3606
3607 fninit
3608 fld tword [A2]
3609%if %2 != 0
3610 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
3611%else
3612 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3613%endif
3614 %1
3615
3616 fnstsw word [A1]
3617
3618 fninit
3619 add xSP, 20h
3620 EPILOGUE_3_ARGS
3621ENDPROC iemAImpl_ %+ %1 %+ _r80
3622%endmacro
3623
3624IEMIMPL_FPU_R80_FSW ftst, 0
3625IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3626
3627
3628
3629;;
3630; FPU instruction loading an 80-bit floating point constant.
3631;
3632; @param 1 The instruction
3633;
3634; @param A0 FPU context (fxsave).
3635; @param A1 Pointer to a IEMFPURESULT for the output.
3636;
3637%macro IEMIMPL_FPU_R80_CONST 1
3638BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
3639 PROLOGUE_2_ARGS
3640 sub xSP, 20h
3641
3642 fninit
3643 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3644 %1
3645
3646 fnstsw word [A1 + IEMFPURESULT.FSW]
3647 fnclex
3648 fstp tword [A1 + IEMFPURESULT.r80Result]
3649
3650 fninit
3651 add xSP, 20h
3652 EPILOGUE_2_ARGS
3653ENDPROC iemAImpl_ %+ %1 %+
3654%endmacro
3655
3656IEMIMPL_FPU_R80_CONST fld1
3657IEMIMPL_FPU_R80_CONST fldl2t
3658IEMIMPL_FPU_R80_CONST fldl2e
3659IEMIMPL_FPU_R80_CONST fldpi
3660IEMIMPL_FPU_R80_CONST fldlg2
3661IEMIMPL_FPU_R80_CONST fldln2
3662IEMIMPL_FPU_R80_CONST fldz
3663
3664
3665;;
3666; FPU instruction working on one 80-bit floating point value, outputting two.
3667;
3668; @param 1 The instruction
3669;
3670; @param A0 FPU context (fxsave).
3671; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3672; @param A2 Pointer to the 80-bit value.
3673;
3674%macro IEMIMPL_FPU_R80_R80 1
3675BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
3676 PROLOGUE_3_ARGS
3677 sub xSP, 20h
3678
3679 fninit
3680 fld tword [A2]
3681 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3682 %1
3683
3684 fnstsw word [A1 + IEMFPURESULTTWO.FSW]
3685 fnclex
3686 fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
3687 fnclex
3688 fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
3689
3690 fninit
3691 add xSP, 20h
3692 EPILOGUE_3_ARGS
3693ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
3694%endmacro
3695
3696IEMIMPL_FPU_R80_R80 fptan
3697IEMIMPL_FPU_R80_R80 fxtract
3698IEMIMPL_FPU_R80_R80 fsincos
3699
3700
3701
3702
3703;---------------------- SSE and MMX Operations ----------------------
3704
3705;; @todo what do we need to do for MMX?
3706%macro IEMIMPL_MMX_PROLOGUE 0
3707%endmacro
3708%macro IEMIMPL_MMX_EPILOGUE 0
3709%endmacro
3710
3711;; @todo what do we need to do for SSE?
3712%macro IEMIMPL_SSE_PROLOGUE 0
3713%endmacro
3714%macro IEMIMPL_SSE_EPILOGUE 0
3715%endmacro
3716
3717;; @todo what do we need to do for AVX?
3718%macro IEMIMPL_AVX_PROLOGUE 0
3719%endmacro
3720%macro IEMIMPL_AVX_EPILOGUE 0
3721%endmacro
3722
3723
3724;;
3725; Media instruction working on two full sized registers.
3726;
3727; @param 1 The instruction
3728; @param 2 Whether there is an MMX variant (1) or not (0).
3729;
3730; @param A0 FPU context (fxsave).
3731; @param A1 Pointer to the first media register size operand (input/output).
3732; @param A2 Pointer to the second media register size operand (input).
3733;
3734%macro IEMIMPL_MEDIA_F2 2
3735%if %2 != 0
3736BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3737 PROLOGUE_3_ARGS
3738 IEMIMPL_MMX_PROLOGUE
3739
3740 movq mm0, [A1]
3741 movq mm1, [A2]
3742 %1 mm0, mm1
3743 movq [A1], mm0
3744
3745 IEMIMPL_MMX_EPILOGUE
3746 EPILOGUE_3_ARGS
3747ENDPROC iemAImpl_ %+ %1 %+ _u64
3748%endif
3749
3750BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3751 PROLOGUE_3_ARGS
3752 IEMIMPL_SSE_PROLOGUE
3753
3754 movdqu xmm0, [A1]
3755 movdqu xmm1, [A2]
3756 %1 xmm0, xmm1
3757 movdqu [A1], xmm0
3758
3759 IEMIMPL_SSE_EPILOGUE
3760 EPILOGUE_3_ARGS
3761ENDPROC iemAImpl_ %+ %1 %+ _u128
3762%endmacro
3763
3764IEMIMPL_MEDIA_F2 pshufb, 1
3765IEMIMPL_MEDIA_F2 pand, 1
3766IEMIMPL_MEDIA_F2 pandn, 1
3767IEMIMPL_MEDIA_F2 por, 1
3768IEMIMPL_MEDIA_F2 pxor, 1
3769IEMIMPL_MEDIA_F2 pcmpeqb, 1
3770IEMIMPL_MEDIA_F2 pcmpeqw, 1
3771IEMIMPL_MEDIA_F2 pcmpeqd, 1
3772IEMIMPL_MEDIA_F2 pcmpeqq, 0
3773IEMIMPL_MEDIA_F2 pcmpgtb, 1
3774IEMIMPL_MEDIA_F2 pcmpgtw, 1
3775IEMIMPL_MEDIA_F2 pcmpgtd, 1
3776IEMIMPL_MEDIA_F2 pcmpgtq, 0
3777IEMIMPL_MEDIA_F2 paddb, 1
3778IEMIMPL_MEDIA_F2 paddw, 1
3779IEMIMPL_MEDIA_F2 paddd, 1
3780IEMIMPL_MEDIA_F2 paddq, 1
3781IEMIMPL_MEDIA_F2 paddsb, 1
3782IEMIMPL_MEDIA_F2 paddsw, 1
3783IEMIMPL_MEDIA_F2 paddusb, 1
3784IEMIMPL_MEDIA_F2 paddusw, 1
3785IEMIMPL_MEDIA_F2 psubb, 1
3786IEMIMPL_MEDIA_F2 psubw, 1
3787IEMIMPL_MEDIA_F2 psubd, 1
3788IEMIMPL_MEDIA_F2 psubq, 1
3789IEMIMPL_MEDIA_F2 psubsb, 1
3790IEMIMPL_MEDIA_F2 psubsw, 1
3791IEMIMPL_MEDIA_F2 psubusb, 1
3792IEMIMPL_MEDIA_F2 psubusw, 1
3793IEMIMPL_MEDIA_F2 pmullw, 1
3794IEMIMPL_MEDIA_F2 pmulld, 0
3795IEMIMPL_MEDIA_F2 pmulhw, 1
3796IEMIMPL_MEDIA_F2 pmaddwd, 1
3797IEMIMPL_MEDIA_F2 pminub, 1
3798IEMIMPL_MEDIA_F2 pminuw, 0
3799IEMIMPL_MEDIA_F2 pminud, 0
3800IEMIMPL_MEDIA_F2 pminsb, 0
3801IEMIMPL_MEDIA_F2 pminsw, 1
3802IEMIMPL_MEDIA_F2 pminsd, 0
3803IEMIMPL_MEDIA_F2 pmaxub, 1
3804IEMIMPL_MEDIA_F2 pmaxuw, 0
3805IEMIMPL_MEDIA_F2 pmaxud, 0
3806IEMIMPL_MEDIA_F2 pmaxsb, 0
3807IEMIMPL_MEDIA_F2 pmaxsw, 1
3808IEMIMPL_MEDIA_F2 pmaxsd, 0
3809IEMIMPL_MEDIA_F2 pabsb, 1
3810IEMIMPL_MEDIA_F2 pabsw, 1
3811IEMIMPL_MEDIA_F2 pabsd, 1
3812IEMIMPL_MEDIA_F2 psignb, 1
3813IEMIMPL_MEDIA_F2 psignw, 1
3814IEMIMPL_MEDIA_F2 psignd, 1
3815IEMIMPL_MEDIA_F2 phaddw, 1
3816IEMIMPL_MEDIA_F2 phaddd, 1
3817IEMIMPL_MEDIA_F2 phsubw, 1
3818IEMIMPL_MEDIA_F2 phsubd, 1
3819IEMIMPL_MEDIA_F2 phaddsw, 1
3820IEMIMPL_MEDIA_F2 phsubsw, 1
3821IEMIMPL_MEDIA_F2 pmaddubsw, 1
3822IEMIMPL_MEDIA_F2 pmulhrsw, 1
3823IEMIMPL_MEDIA_F2 pmuludq, 1
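;
; Illustrative expansion (not assembled): IEMIMPL_MEDIA_F2 pxor, 1 produces an
; iemAImpl_pxor_u64 MMX variant plus, roughly, the following SSE variant.
;
%if 0
BEGINPROC_FASTCALL iemAImpl_pxor_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        pxor    xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pxor_u128
%endif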
3824
3825
3826;;
3827; Media instruction working on two full sized registers, but no FXSAVE state argument.
3828;
3829; @param 1 The instruction
3830; @param 2 Whether there is an MMX variant (1) or not (0).
3831;
3832; @param A0 Pointer to the first media register size operand (input/output).
3833; @param A1 Pointer to the second media register size operand (input).
3834;
3835%macro IEMIMPL_MEDIA_OPT_F2 2
3836%if %2 != 0
3837BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3838 PROLOGUE_2_ARGS
3839 IEMIMPL_MMX_PROLOGUE
3840
3841 movq mm0, [A0]
3842 movq mm1, [A1]
3843 %1 mm0, mm1
3844 movq [A0], mm0
3845
3846 IEMIMPL_MMX_EPILOGUE
3847 EPILOGUE_2_ARGS
3848ENDPROC iemAImpl_ %+ %1 %+ _u64
3849%endif
3850
3851BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3852 PROLOGUE_2_ARGS
3853 IEMIMPL_SSE_PROLOGUE
3854
3855 movdqu xmm0, [A0]
3856 movdqu xmm1, [A1]
3857 %1 xmm0, xmm1
3858 movdqu [A0], xmm0
3859
3860 IEMIMPL_SSE_EPILOGUE
3861 EPILOGUE_2_ARGS
3862ENDPROC iemAImpl_ %+ %1 %+ _u128
3863%endmacro
3864
3865IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3866IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3867IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3868IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3869IEMIMPL_MEDIA_OPT_F2 psllw, 1
3870IEMIMPL_MEDIA_OPT_F2 pslld, 1
3871IEMIMPL_MEDIA_OPT_F2 psllq, 1
3872IEMIMPL_MEDIA_OPT_F2 psrlw, 1
3873IEMIMPL_MEDIA_OPT_F2 psrld, 1
3874IEMIMPL_MEDIA_OPT_F2 psrlq, 1
3875IEMIMPL_MEDIA_OPT_F2 psraw, 1
3876IEMIMPL_MEDIA_OPT_F2 psrad, 1
3877IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
3878IEMIMPL_MEDIA_OPT_F2 pavgb, 1
3879IEMIMPL_MEDIA_OPT_F2 pavgw, 1
3880IEMIMPL_MEDIA_OPT_F2 psadbw, 1
3881IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
3882IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
3883IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
3884IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
3885IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3886IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3887IEMIMPL_MEDIA_OPT_F2 aesimc, 0
3888IEMIMPL_MEDIA_OPT_F2 aesenc, 0
3889IEMIMPL_MEDIA_OPT_F2 aesdec, 0
3890IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
3891IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3892IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
3893IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
3894IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
3895IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
3896IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3897
3898;;
3899; Media instruction working on one full sized and one half sized register (lower half).
3900;
3901; @param 1 The instruction
3902; @param 2 1 if MMX is included, 0 if not.
3903;
3904; @param A0 Pointer to the first full sized media register operand (input/output).
3905; @param A1 Pointer to the second half sized media register operand (input).
3906;
3907%macro IEMIMPL_MEDIA_F1L1 2
3908 %if %2 != 0
3909BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3910 PROLOGUE_2_ARGS
3911 IEMIMPL_MMX_PROLOGUE
3912
3913 movq mm0, [A0]
3914 movq mm1, [A1]
3915 %1 mm0, mm1
3916 movq [A0], mm0
3917
3918 IEMIMPL_MMX_EPILOGUE
3919 EPILOGUE_2_ARGS
3920ENDPROC iemAImpl_ %+ %1 %+ _u64
3921 %endif
3922
3923BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3924 PROLOGUE_2_ARGS
3925 IEMIMPL_SSE_PROLOGUE
3926
3927 movdqu xmm0, [A0]
3928 movdqu xmm1, [A1]
3929 %1 xmm0, xmm1
3930 movdqu [A0], xmm0
3931
3932 IEMIMPL_SSE_EPILOGUE
3933 EPILOGUE_2_ARGS
3934ENDPROC iemAImpl_ %+ %1 %+ _u128
3935%endmacro
3936
3937IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3938IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3939IEMIMPL_MEDIA_F1L1 punpckldq, 1
3940IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3941
3942
3943;;
3944; Media instruction working on two half sized input registers (lower half) and a full sized
3945; destination register (vpunpckl*).
3946;
3947; @param 1 The instruction
3948;
3949; @param A0 Pointer to the destination register (full sized, output only).
3950; @param A1 Pointer to the first full sized media source register operand, where we
3951; will only use the lower half as input - but we'll be loading it in full.
3952; @param A2 Pointer to the second full sized media source register operand, where we
3953; will only use the lower half as input - but we'll be loading it in full.
3954;
3955%macro IEMIMPL_MEDIA_F1L1L1 1
3956BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3957 PROLOGUE_3_ARGS
3958 IEMIMPL_AVX_PROLOGUE
3959
3960 vmovdqu xmm0, [A1]
3961 vmovdqu xmm1, [A2]
3962 %1 xmm0, xmm0, xmm1
3963 vmovdqu [A0], xmm0
3964
3965 IEMIMPL_AVX_EPILOGUE
3966 EPILOGUE_3_ARGS
3967ENDPROC iemAImpl_ %+ %1 %+ _u128
3968
3969BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
3970 PROLOGUE_3_ARGS
3971 IEMIMPL_AVX_PROLOGUE
3972
3973 vmovdqu ymm0, [A1]
3974 vmovdqu ymm1, [A2]
3975 %1 ymm0, ymm0, ymm1
3976 vmovdqu [A0], ymm0
3977
3978 IEMIMPL_AVX_EPILOGUE
3979 EPILOGUE_3_ARGS
3980ENDPROC iemAImpl_ %+ %1 %+ _u256
3981%endmacro
3982
3983IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
3984IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
3985IEMIMPL_MEDIA_F1L1L1 vpunpckldq
3986IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3987
3988
3989;;
3990; Media instruction working on one full sized and one half sized register (high half).
3991;
3992; @param 1 The instruction
3993; @param 2 1 if MMX is included, 0 if not.
3994;
3995; @param A0 Pointer to the first full sized media register operand (input/output).
3996; @param A1 Pointer to the second full sized media register operand, where we
3997; will only use the upper half as input - but we'll load it in full.
3998;
3999%macro IEMIMPL_MEDIA_F1H1 2
4000IEMIMPL_MEDIA_F1L1 %1, %2
4001%endmacro
4002
4003IEMIMPL_MEDIA_F1L1 punpckhbw, 1
4004IEMIMPL_MEDIA_F1L1 punpckhwd, 1
4005IEMIMPL_MEDIA_F1L1 punpckhdq, 1
4006IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
4007
4008
4009;;
4010; Media instruction working on two half sized input registers (high half) and a full sized
4011; destination register (vpunpckh*).
4012;
4013; @param 1 The instruction
4014;
4015; @param A0 Pointer to the destination register (full sized, output only).
4016; @param A1 Pointer to the first full sized media source register operand, where we
4017; will only use the upper half as input - but we'll be loading it in full.
4018; @param A2 Pointer to the second full sized media source register operand, where we
4019; will only use the upper half as input - but we'll be loading it in full.
4020;
4021%macro IEMIMPL_MEDIA_F1H1H1 1
4022IEMIMPL_MEDIA_F1L1L1 %1
4023%endmacro
4024
4025IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
4026IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
4027IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
4028IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4029
4030
4031;
4032; Shufflers with evil 8-bit immediates.
4033;
4034
4035BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
4036 PROLOGUE_3_ARGS
4037 IEMIMPL_MMX_PROLOGUE
4038
4039 movzx A2, A2_8 ; must clear top bits
4040 movq mm1, [A1]
4041 movq mm0, mm1 ; paranoia!
4042 lea T1, [.imm0 xWrtRIP]
4043 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4044 lea T0, [A2 + A2*8] ; sizeof(pshufw+ret) == 9
4045 %else
4046 lea T0, [A2 + A2*4] ; sizeof(pshufw+ret) == 5
4047 %endif
4048 lea T1, [T1 + T0]
4049 IBT_NOTRACK
4050 call T1
4051 movq [A0], mm0
4052
4053 IEMIMPL_MMX_EPILOGUE
4054 EPILOGUE_3_ARGS
4055%assign bImm 0
4056%rep 256
4057.imm %+ bImm:
4058 IBT_ENDBRxx_WITHOUT_NOTRACK
4059 pshufw mm0, mm1, bImm
4060 ret
4061 %assign bImm bImm + 1
4062%endrep
4063.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
4064ENDPROC iemAImpl_pshufw_u64
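;
; A note on the dispatch arithmetic above (the same pattern is used by the other imm8 jump
; tables below): all 256 stubs are the same size, so the target is simply .imm0 + bImm*cbStub.
; For pshufw a stub is pshufw (4 bytes) + ret (1 byte) = 5 bytes without IBT, or endbr64 (4)
; + pshufw (4) + ret (1) = 9 bytes with IBT. E.g. with an immediate of 0x20 (a value picked
; purely for illustration) and no IBT:
;       lea T0, [A2 + A2*4]     ; T0 = 0x20 * 5 = 0xA0
;       lea T1, [T1 + T0]       ; T1 = .imm0 + 0xA0, i.e. the address of .imm32
;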
4065
4066
4067%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
4068BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4069 PROLOGUE_3_ARGS
4070 IEMIMPL_SSE_PROLOGUE
4071
4072 movzx A2, A2_8 ; must clear top bits
4073 movdqu xmm1, [A1]
4074 movdqu xmm0, xmm1 ; paranoia!
4075 lea T1, [.imm0 xWrtRIP]
4076 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4077 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
4078 %else
4079 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4080 %endif
4081 lea T1, [T1 + T0*2]
4082 IBT_NOTRACK
4083 call T1
4084 movdqu [A0], xmm0
4085
4086 IEMIMPL_SSE_EPILOGUE
4087 EPILOGUE_3_ARGS
4088
4089 %assign bImm 0
4090 %rep 256
4091.imm %+ bImm:
4092 IBT_ENDBRxx_WITHOUT_NOTRACK
4093 %1 xmm0, xmm1, bImm
4094 ret
4095 %assign bImm bImm + 1
4096 %endrep
4097.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4098ENDPROC iemAImpl_ %+ %1 %+ _u128
4099%endmacro
4100
4101IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
4102IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
4103IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4104
4105
4106%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
4107BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4108 PROLOGUE_3_ARGS
4109 IEMIMPL_SSE_PROLOGUE
4110
4111 movzx A2, A2_8 ; must clear top bits
4112 vmovdqu ymm1, [A1]
4113 vmovdqu ymm0, ymm1 ; paranoia!
4114 lea T1, [.imm0 xWrtRIP]
4115 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4116 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
4117 %else
4118 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4119 %endif
4120 lea T1, [T1 + T0*2]
4121 IBT_NOTRACK
4122 call T1
4123 vmovdqu [A0], ymm0
4124
4125 IEMIMPL_SSE_EPILOGUE
4126 EPILOGUE_3_ARGS
4127 %assign bImm 0
4128 %rep 256
4129.imm %+ bImm:
4130 IBT_ENDBRxx_WITHOUT_NOTRACK
4131 %1 ymm0, ymm1, bImm
4132 ret
4133 %assign bImm bImm + 1
4134 %endrep
4135.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4136ENDPROC iemAImpl_ %+ %1 %+ _u256
4137%endmacro
4138
4139IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
4140IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
4141IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4142
4143
4144;
4145; Shifts with evil 8-bit immediates.
4146;
4147
4148%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
4149BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
4150 PROLOGUE_2_ARGS
4151 IEMIMPL_MMX_PROLOGUE
4152
4153 movzx A1, A1_8 ; must clear top bits
4154 movq mm0, [A0]
4155 lea T1, [.imm0 xWrtRIP]
4156 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4157 lea T0, [A1 + A1*8] ; sizeof(psXX+ret) == 9
4158 %else
4159 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 5
4160 %endif
4161 lea T1, [T1 + T0]
4162 IBT_NOTRACK
4163 call T1
4164 movq [A0], mm0
4165
4166 IEMIMPL_MMX_EPILOGUE
4167 EPILOGUE_2_ARGS
4168%assign bImm 0
4169%rep 256
4170.imm %+ bImm:
4171 IBT_ENDBRxx_WITHOUT_NOTRACK
4172 %1 mm0, bImm
4173 ret
4174 %assign bImm bImm + 1
4175%endrep
4176.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
4177ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
4178%endmacro
4179
4180IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4181IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4182IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4183IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4184IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4185IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4186IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4187IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4188
4189
4190%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
4191BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
4192 PROLOGUE_2_ARGS
4193 IEMIMPL_SSE_PROLOGUE
4194
4195 movzx A1, A1_8 ; must clear top bits
4196 movdqu xmm0, [A0]
4197 lea T1, [.imm0 xWrtRIP]
4198 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4199 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
4200 %else
4201 lea T0, [A1 + A1*2] ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
4202 %endif
4203 lea T1, [T1 + T0*2]
4204 IBT_NOTRACK
4205 call T1
4206 movdqu [A0], xmm0
4207
4208 IEMIMPL_SSE_EPILOGUE
4209 EPILOGUE_2_ARGS
4210 %assign bImm 0
4211 %rep 256
4212.imm %+ bImm:
4213 IBT_ENDBRxx_WITHOUT_NOTRACK
4214 %1 xmm0, bImm
4215 ret
4216 %assign bImm bImm + 1
4217 %endrep
4218.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4219ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
4220%endmacro
4221
4222IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
4223IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
4224IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
4225IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
4226IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
4227IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
4228IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
4229IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
4230IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
4231IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4232
4233
4234;
4235; Move byte mask.
4236;
4237
4238BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
4239 PROLOGUE_2_ARGS
4240 IEMIMPL_MMX_PROLOGUE
4241
4242 movq mm1, [A1]
4243 pmovmskb T0, mm1
4244 mov [A0], T0
4245%ifdef RT_ARCH_X86
4246 mov dword [A0 + 4], 0
4247%endif
4248 IEMIMPL_MMX_EPILOGUE
4249 EPILOGUE_2_ARGS
4250ENDPROC iemAImpl_pmovmskb_u64
4251
4252BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
4253 PROLOGUE_2_ARGS
4254 IEMIMPL_SSE_PROLOGUE
4255
4256 movdqu xmm1, [A1]
4257 pmovmskb T0, xmm1
4258 mov [A0], T0
4259%ifdef RT_ARCH_X86
4260 mov dword [A0 + 4], 0
4261%endif
4262 IEMIMPL_SSE_EPILOGUE
4263 EPILOGUE_2_ARGS
4264ENDPROC iemAImpl_pmovmskb_u128
4265
4266BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
4267 PROLOGUE_2_ARGS
4268 IEMIMPL_AVX_PROLOGUE
4269
4270 vmovdqu ymm1, [A1]
4271 vpmovmskb T0, ymm1
4272 mov [A0], T0
4273%ifdef RT_ARCH_X86
4274 mov dword [A0 + 4], 0
4275%endif
4276 IEMIMPL_AVX_EPILOGUE
4277 EPILOGUE_2_ARGS
4278ENDPROC iemAImpl_vpmovmskb_u256
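;
; For reference: [v]pmovmskb gathers the most significant bit of each source byte into the
; low bits of the destination (8, 16 or 32 mask bits for MMX, XMM and YMM sources
; respectively) and zeroes the rest. E.g. an XMM source whose even numbered bytes have bit 7
; set and whose odd numbered bytes do not yields 0x5555 (example value for illustration only).
;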
4279
4280
4281;;
4282; Media instruction working on two full sized source registers and one destination (AVX).
4283;
4284; @param 1 The instruction
4285;
4286; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4287; @param A1 Pointer to the destination media register size operand (output).
4288; @param A2 Pointer to the first source media register size operand (input).
4289; @param A3 Pointer to the second source media register size operand (input).
4290;
4291%macro IEMIMPL_MEDIA_F3 1
4292BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4293 PROLOGUE_4_ARGS
4294 IEMIMPL_AVX_PROLOGUE
4295
4296 vmovdqu xmm0, [A2]
4297 vmovdqu xmm1, [A3]
4298 %1 xmm0, xmm0, xmm1
4299 vmovdqu [A1], xmm0
4300
4301 IEMIMPL_AVX_EPILOGUE
4302 EPILOGUE_4_ARGS
4303ENDPROC iemAImpl_ %+ %1 %+ _u128
4304
4305BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4306 PROLOGUE_4_ARGS
4307 IEMIMPL_AVX_PROLOGUE
4308
4309 vmovdqu ymm0, [A2]
4310 vmovdqu ymm1, [A3]
4311 %1 ymm0, ymm0, ymm1
4312 vmovdqu [A1], ymm0
4313
4314 IEMIMPL_AVX_EPILOGUE
4315 EPILOGUE_4_ARGS
4316ENDPROC iemAImpl_ %+ %1 %+ _u256
4317%endmacro
4318
4319IEMIMPL_MEDIA_F3 vpshufb
4320IEMIMPL_MEDIA_F3 vpand
4321IEMIMPL_MEDIA_F3 vpminub
4322IEMIMPL_MEDIA_F3 vpminuw
4323IEMIMPL_MEDIA_F3 vpminud
4324IEMIMPL_MEDIA_F3 vpminsb
4325IEMIMPL_MEDIA_F3 vpminsw
4326IEMIMPL_MEDIA_F3 vpminsd
4327IEMIMPL_MEDIA_F3 vpmaxub
4328IEMIMPL_MEDIA_F3 vpmaxuw
4329IEMIMPL_MEDIA_F3 vpmaxud
4330IEMIMPL_MEDIA_F3 vpmaxsb
4331IEMIMPL_MEDIA_F3 vpmaxsw
4332IEMIMPL_MEDIA_F3 vpmaxsd
4333IEMIMPL_MEDIA_F3 vpandn
4334IEMIMPL_MEDIA_F3 vpor
4335IEMIMPL_MEDIA_F3 vpxor
4336IEMIMPL_MEDIA_F3 vpcmpeqb
4337IEMIMPL_MEDIA_F3 vpcmpeqw
4338IEMIMPL_MEDIA_F3 vpcmpeqd
4339IEMIMPL_MEDIA_F3 vpcmpeqq
4340IEMIMPL_MEDIA_F3 vpcmpgtb
4341IEMIMPL_MEDIA_F3 vpcmpgtw
4342IEMIMPL_MEDIA_F3 vpcmpgtd
4343IEMIMPL_MEDIA_F3 vpcmpgtq
4344IEMIMPL_MEDIA_F3 vpaddb
4345IEMIMPL_MEDIA_F3 vpaddw
4346IEMIMPL_MEDIA_F3 vpaddd
4347IEMIMPL_MEDIA_F3 vpaddq
4348IEMIMPL_MEDIA_F3 vpsubb
4349IEMIMPL_MEDIA_F3 vpsubw
4350IEMIMPL_MEDIA_F3 vpsubd
4351IEMIMPL_MEDIA_F3 vpsubq
4352
4353
4354;;
4355; Media instruction working on two full sized source registers and one destination (AVX),
4356; but no XSAVE state pointer argument.
4357;
4358; @param 1 The instruction
4359;
4360; @param A0 Pointer to the destination media register size operand (output).
4361; @param A1 Pointer to the first source media register size operand (input).
4362; @param A2 Pointer to the second source media register size operand (input).
4363;
4364%macro IEMIMPL_MEDIA_OPT_F3 1
4365BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4366 PROLOGUE_3_ARGS
4367 IEMIMPL_AVX_PROLOGUE
4368
4369 vmovdqu xmm0, [A1]
4370 vmovdqu xmm1, [A2]
4371 %1 xmm0, xmm0, xmm1
4372 vmovdqu [A0], xmm0
4373
4374 IEMIMPL_AVX_EPILOGUE
4375 EPILOGUE_3_ARGS
4376ENDPROC iemAImpl_ %+ %1 %+ _u128
4377
4378BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4379 PROLOGUE_3_ARGS
4380 IEMIMPL_AVX_PROLOGUE
4381
4382 vmovdqu ymm0, [A1]
4383 vmovdqu ymm1, [A2]
4384 %1 ymm0, ymm0, ymm1
4385 vmovdqu [A0], ymm0
4386
4387 IEMIMPL_AVX_EPILOGUE
4388 EPILOGUE_3_ARGS
4389ENDPROC iemAImpl_ %+ %1 %+ _u256
4390%endmacro
4391
4392IEMIMPL_MEDIA_OPT_F3 vpacksswb
4393IEMIMPL_MEDIA_OPT_F3 vpackssdw
4394IEMIMPL_MEDIA_OPT_F3 vpackuswb
4395IEMIMPL_MEDIA_OPT_F3 vpackusdw
4396IEMIMPL_MEDIA_OPT_F3 vpmullw
4397IEMIMPL_MEDIA_OPT_F3 vpmulld
4398IEMIMPL_MEDIA_OPT_F3 vpmulhw
4399IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4400IEMIMPL_MEDIA_OPT_F3 vpavgb
4401IEMIMPL_MEDIA_OPT_F3 vpavgw
4402IEMIMPL_MEDIA_OPT_F3 vpsignb
4403IEMIMPL_MEDIA_OPT_F3 vpsignw
4404IEMIMPL_MEDIA_OPT_F3 vpsignd
4405IEMIMPL_MEDIA_OPT_F3 vphaddw
4406IEMIMPL_MEDIA_OPT_F3 vphaddd
4407IEMIMPL_MEDIA_OPT_F3 vphsubw
4408IEMIMPL_MEDIA_OPT_F3 vphsubd
4409IEMIMPL_MEDIA_OPT_F3 vphaddsw
4410IEMIMPL_MEDIA_OPT_F3 vphsubsw
4411IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4412IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4413IEMIMPL_MEDIA_OPT_F3 vpsadbw
4414IEMIMPL_MEDIA_OPT_F3 vpmuldq
4415IEMIMPL_MEDIA_OPT_F3 vpmuludq
4416IEMIMPL_MEDIA_OPT_F3 vunpcklps
4417IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4418IEMIMPL_MEDIA_OPT_F3 vunpckhps
4419IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4420IEMIMPL_MEDIA_OPT_F3 vpsubsb
4421IEMIMPL_MEDIA_OPT_F3 vpsubsw
4422IEMIMPL_MEDIA_OPT_F3 vpsubusb
4423IEMIMPL_MEDIA_OPT_F3 vpsubusw
4424IEMIMPL_MEDIA_OPT_F3 vpaddusb
4425IEMIMPL_MEDIA_OPT_F3 vpaddusw
4426IEMIMPL_MEDIA_OPT_F3 vpaddsb
4427IEMIMPL_MEDIA_OPT_F3 vpaddsw
4428IEMIMPL_MEDIA_OPT_F3 vpermilps
4429IEMIMPL_MEDIA_OPT_F3 vpermilpd
4430IEMIMPL_MEDIA_OPT_F3 vpmaddwd
4431IEMIMPL_MEDIA_OPT_F3 vpsrlvd
4432IEMIMPL_MEDIA_OPT_F3 vpsrlvq
4433IEMIMPL_MEDIA_OPT_F3 vpsravd
4434IEMIMPL_MEDIA_OPT_F3 vpsllvd
4435IEMIMPL_MEDIA_OPT_F3 vpsllvq
4436
4437;;
4438; Media instruction working on one full sized source register, one full sized destination
4439; register, and one no-larger-than-XMM register. For the vps{ll,ra,rl}[dwq] instructions
4440; that last operand is loaded as a full 128-bit value from which a 64-bit shift count is
4441; extracted; if the unsigned count is larger than the permissible maximum shift for the
4442; element width (16, 32 or 64 bits), it acts as if the maximum shift count had been given.
4443;
4444; @param 1 The instruction
4445;
4446; @param A0 Pointer to the destination media register size operand (output).
4447; @param A1 Pointer to the first source media register size operand (input).
4448; @param A2 Pointer to the second source media register size operand (input).
4449;
4450%macro IEMIMPL_SHIFT_OPT_F3 1
4451BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4452 PROLOGUE_3_ARGS
4453 IEMIMPL_AVX_PROLOGUE
4454
4455 vmovdqu xmm0, [A1]
4456 vmovdqu xmm1, [A2]
4457 %1 xmm0, xmm0, xmm1
4458 vmovdqu [A0], xmm0
4459
4460 IEMIMPL_AVX_EPILOGUE
4461 EPILOGUE_3_ARGS
4462ENDPROC iemAImpl_ %+ %1 %+ _u128
4463
4464BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4465 PROLOGUE_3_ARGS
4466 IEMIMPL_AVX_PROLOGUE
4467
4468 vmovdqu ymm0, [A1]
4469 vmovdqu xmm1, [A2]
4470 %1 ymm0, ymm0, xmm1
4471 vmovdqu [A0], ymm0
4472
4473 IEMIMPL_AVX_EPILOGUE
4474 EPILOGUE_3_ARGS
4475ENDPROC iemAImpl_ %+ %1 %+ _u256
4476%endmacro
4477
4478IEMIMPL_SHIFT_OPT_F3 vpsllw
4479IEMIMPL_SHIFT_OPT_F3 vpslld
4480IEMIMPL_SHIFT_OPT_F3 vpsllq
4481IEMIMPL_SHIFT_OPT_F3 vpsraw
4482IEMIMPL_SHIFT_OPT_F3 vpsrad
4483IEMIMPL_SHIFT_OPT_F3 vpsrlw
4484IEMIMPL_SHIFT_OPT_F3 vpsrld
4485IEMIMPL_SHIFT_OPT_F3 vpsrlq
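;
; Worked example of the shift count handling described above (values picked for illustration
; only): for vpsllw a count of 3 in the low quadword of the last operand shifts every word
; element left by 3, while a count of 17 (larger than 15, the maximum meaningful count for
; word elements) produces all zero elements; for the arithmetic right shifts (vpsraw/vpsrad)
; an oversized count instead fills each element with copies of its sign bit.
;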
4486
4487
4488;;
4489; Media instruction working on one full sized source register and one destination (AVX),
4490; but no XSAVE state pointer argument.
4491;
4492; @param 1 The instruction
4493; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
4494;
4495; @param A0 Pointer to the destination media register size operand (output).
4496; @param A1 Pointer to the source media register size operand (input).
4497;
4498%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
4499BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4500 PROLOGUE_2_ARGS
4501 IEMIMPL_AVX_PROLOGUE
4502
4503 vmovdqu xmm0, [A1]
4504 %1 xmm0, xmm0
4505 vmovdqu [A0], xmm0
4506
4507 IEMIMPL_AVX_EPILOGUE
4508 EPILOGUE_2_ARGS
4509ENDPROC iemAImpl_ %+ %1 %+ _u128
4510
4511 %if %2 == 1
4512BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4513 PROLOGUE_2_ARGS
4514 IEMIMPL_AVX_PROLOGUE
4515
4516 vmovdqu ymm0, [A1]
4517 %1 ymm0, ymm0
4518 vmovdqu [A0], ymm0
4519
4520 IEMIMPL_AVX_EPILOGUE
4521 EPILOGUE_2_ARGS
4522ENDPROC iemAImpl_ %+ %1 %+ _u256
4523 %endif
4524%endmacro
4525
4526IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4527IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4528IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4529IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4530
4531
4532;
4533; The SSE 4.2 crc32
4534;
4535; @param A0 Pointer to the 32-bit destination (input/output).
4536; @param A1 The source operand, sized according to the suffix.
4537;
4538BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
4539 PROLOGUE_2_ARGS
4540
4541 mov T0_32, [A0]
4542 crc32 T0_32, A1_8
4543 mov [A0], T0_32
4544
4545 EPILOGUE_2_ARGS
4546ENDPROC iemAImpl_crc32_u8
4547
4548BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
4549 PROLOGUE_2_ARGS
4550
4551 mov T0_32, [A0]
4552 crc32 T0_32, A1_16
4553 mov [A0], T0_32
4554
4555 EPILOGUE_2_ARGS
4556ENDPROC iemAImpl_crc32_u16
4557
4558BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
4559 PROLOGUE_2_ARGS
4560
4561 mov T0_32, [A0]
4562 crc32 T0_32, A1_32
4563 mov [A0], T0_32
4564
4565 EPILOGUE_2_ARGS
4566ENDPROC iemAImpl_crc32_u32
4567
4568%ifdef RT_ARCH_AMD64
4569BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
4570 PROLOGUE_2_ARGS
4571
4572 mov T0_32, [A0]
4573 crc32 T0, A1
4574 mov [A0], T0_32
4575
4576 EPILOGUE_2_ARGS
4577ENDPROC iemAImpl_crc32_u64
4578%endif
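;
; For reference: the SSE 4.2 crc32 instruction accumulates using the CRC-32C (Castagnoli)
; polynomial 0x11EDC6F41. The 32-bit accumulator lives in [A0] across calls, so larger
; buffers can be processed by chaining the byte/word/dword/qword helpers above.
;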
4579
4580
4581;
4582; PTEST (SSE 4.1)
4583;
4584; @param A0 Pointer to the first source operand (aka readonly destination).
4585; @param A1 Pointer to the second source operand.
4586; @param A2 Pointer to the EFLAGS register.
4587;
4588BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
4589 PROLOGUE_3_ARGS
4590 IEMIMPL_SSE_PROLOGUE
4591
4592 movdqu xmm0, [A0]
4593 movdqu xmm1, [A1]
4594 ptest xmm0, xmm1
4595 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0
4596
4597 IEMIMPL_SSE_EPILOGUE
4598 EPILOGUE_3_ARGS
4599ENDPROC iemAImpl_ptest_u128
4600
4601BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
4602 PROLOGUE_3_ARGS
4603 IEMIMPL_SSE_PROLOGUE
4604
4605 vmovdqu ymm0, [A0]
4606 vmovdqu ymm1, [A1]
4607 vptest ymm0, ymm1
4608 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0
4609
4610 IEMIMPL_SSE_EPILOGUE
4611 EPILOGUE_3_ARGS
4612ENDPROC iemAImpl_vptest_u256
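;
; For reference, [v]ptest only updates EFLAGS (captured by IEM_SAVE_FLAGS above): ZF is set
; when (first operand AND second operand) is all zeroes, CF is set when (NOT first operand
; AND second operand) is all zeroes, and AF/OF/PF/SF are cleared.
;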
4613
4614
4615;;
4616; Template for the [v]pmov{s,z}x* instructions
4617;
4618; @param 1 The instruction
4619;
4620; @param A0 Pointer to the destination media register size operand (output).
4621; @param A1 The source operand value for the 128-bit variants, or a pointer to it for the 256-bit variant (input).
4622;
4623%macro IEMIMPL_V_PMOV_SZ_X 1
4624BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4625 PROLOGUE_2_ARGS
4626 IEMIMPL_SSE_PROLOGUE
4627
4628 movd xmm0, A1
4629 %1 xmm0, xmm0
4630 vmovdqu [A0], xmm0
4631
4632 IEMIMPL_SSE_EPILOGUE
4633 EPILOGUE_2_ARGS
4634ENDPROC iemAImpl_ %+ %1 %+ _u128
4635
4636BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4637 PROLOGUE_2_ARGS
4638 IEMIMPL_AVX_PROLOGUE
4639
4640 movd xmm0, A1
4641 v %+ %1 xmm0, xmm0
4642 vmovdqu [A0], xmm0
4643
4644 IEMIMPL_AVX_EPILOGUE
4645 EPILOGUE_2_ARGS
4646ENDPROC iemAImpl_v %+ %1 %+ _u128
4647
4648BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4649 PROLOGUE_2_ARGS
4650 IEMIMPL_AVX_PROLOGUE
4651
4652 movdqu xmm0, [A1]
4653 v %+ %1 ymm0, xmm0
4654 vmovdqu [A0], ymm0
4655
4656 IEMIMPL_AVX_EPILOGUE
4657 EPILOGUE_2_ARGS
4658ENDPROC iemAImpl_v %+ %1 %+ _u256
4659%endmacro
4660
4661IEMIMPL_V_PMOV_SZ_X pmovsxbw
4662IEMIMPL_V_PMOV_SZ_X pmovsxbd
4663IEMIMPL_V_PMOV_SZ_X pmovsxbq
4664IEMIMPL_V_PMOV_SZ_X pmovsxwd
4665IEMIMPL_V_PMOV_SZ_X pmovsxwq
4666IEMIMPL_V_PMOV_SZ_X pmovsxdq
4667
4668IEMIMPL_V_PMOV_SZ_X pmovzxbw
4669IEMIMPL_V_PMOV_SZ_X pmovzxbd
4670IEMIMPL_V_PMOV_SZ_X pmovzxbq
4671IEMIMPL_V_PMOV_SZ_X pmovzxwd
4672IEMIMPL_V_PMOV_SZ_X pmovzxwq
4673IEMIMPL_V_PMOV_SZ_X pmovzxdq
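;
; Quick illustration of the widening done by these stubs (values are just examples):
; pmovsxbw sign extends the 8 low source bytes to 8 words, so a source byte of 0x80 becomes
; 0xFF80, whereas pmovzxbw zero extends it to 0x0080; the wq/dq forms widen words and
; doublewords to quadwords in the same manner.
;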
4674
4675
4676;;
4677; Need to move this as well somewhere better?
4678;
4679struc IEMSSERESULT
4680 .uResult resd 4
4681 .MXCSR resd 1
4682endstruc
4683
4684
4685;;
4686; Need to move this as well somewhere better?
4687;
4688struc IEMAVX128RESULT
4689 .uResult resd 4
4690 .MXCSR resd 1
4691endstruc
4692
4693
4694;;
4695; Need to move this as well somewhere better?
4696;
4697struc IEMAVX256RESULT
4698 .uResult resd 8
4699 .MXCSR resd 1
4700endstruc
4701
4702
4703;;
4704; Initializes the SSE MXCSR register partially from the guest value, taking over the
4705; rounding mode and the DAZ/FZ bits while masking all SIMD floating-point exceptions.
4706;
4707; @uses 4 bytes of stack to save the original value, T0.
4708; @param 1 Expression giving the address of the FXSTATE of the guest.
4709;
4710%macro SSE_LD_FXSTATE_MXCSR 1
4711 sub xSP, 4
4712
4713 stmxcsr [xSP]
4714 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4715 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
4716 or T0_32, X86_MXCSR_XCPT_MASK
4717 sub xSP, 4
4718 mov [xSP], T0_32
4719 ldmxcsr [xSP]
4720 add xSP, 4
4721%endmacro
4722
4723
4724;;
4725; Restores the SSE MXCSR register with the original value.
4726;
4727; @uses 4 bytes of stack to save the current MXCSR value, T0, T1.
4728; @param 1 Expression giving the address where to return the MXCSR value.
4729; @param 2 Expression giving the address of the FXSTATE of the guest.
4730;
4731; @note Restores the stack pointer.
4732;
4733%macro SSE_ST_FXSTATE_MXCSR 2
4734 sub xSP, 4
4735 stmxcsr [xSP]
4736 mov T0_32, [xSP]
4737 add xSP, 4
4738 ; Merge the status bits into the original MXCSR value.
4739 mov T1_32, [%2 + X86FXSTATE.MXCSR]
4740 and T0_32, X86_MXCSR_XCPT_FLAGS
4741 or T0_32, T1_32
4742 mov [%1 + IEMSSERESULT.MXCSR], T0_32
4743
4744 ldmxcsr [xSP]
4745 add xSP, 4
4746%endmacro
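;
; In other words: SSE_LD_FXSTATE_MXCSR loads the host MXCSR with
;       (guest_MXCSR & (X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ)) | X86_MXCSR_XCPT_MASK
; i.e. the guest rounding mode and DAZ/FZ bits with all exceptions masked, and
; SSE_ST_FXSTATE_MXCSR then returns
;       guest_MXCSR | (host_MXCSR & X86_MXCSR_XCPT_FLAGS)
; i.e. the unmodified guest value with any newly raised exception flags merged in.
;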
4747
4748
4749;;
4750; Initializes the SSE MXCSR register partially from the guest value, taking over the
4751; rounding mode and the DAZ/FZ bits while masking all SIMD floating-point exceptions.
4752;
4753; @uses 4 bytes of stack to save the original value.
4754; @param 1 Expression giving the address of the FXSTATE of the guest.
4755;
4756%macro AVX_LD_XSAVEAREA_MXCSR 1
4757 sub xSP, 4
4758
4759 stmxcsr [xSP]
4760 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4761 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
 or T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions, like the SSE variant above does
4762 sub xSP, 4
4763 mov [xSP], T0_32
4764 ldmxcsr [xSP]
4765 add xSP, 4
4766%endmacro
4767
4768
4769;;
4770; Restores the AVX128 MXCSR register with the original value.
4771;
4772; @param 1 Expression giving the address where to return the MXCSR value.
4773;
4774; @note Restores the stack pointer.
4775;
4776%macro AVX128_ST_XSAVEAREA_MXCSR 1
4777 stmxcsr [%1 + IEMAVX128RESULT.MXCSR]
4778
4779 ldmxcsr [xSP]
4780 add xSP, 4
4781%endmacro
4782
4783
4784;;
4785; Restores the AVX256 MXCSR register with the original value.
4786;
4787; @param 1 Expression giving the address where to return the MXCSR value.
4788;
4789; @note Restores the stack pointer.
4790;
4791%macro AVX256_ST_XSAVEAREA_MXCSR 1
4792 stmxcsr [%1 + IEMAVX256RESULT.MXCSR]
4793
4794 ldmxcsr [xSP]
4795 add xSP, 4
4796%endmacro
4797
4798
4799;;
4800; Floating point instruction working on two full sized registers.
4801;
4802; @param 1 The instruction
4803; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4804;
4805; @param A0 FPU context (FXSTATE or XSAVEAREA).
4806; @param A1 Where to return the result including the MXCSR value.
4807; @param A2 Pointer to the first media register size operand (input).
4808; @param A3 Pointer to the second media register size operand (input).
4809;
4810%macro IEMIMPL_FP_F2 2
4811BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4812 PROLOGUE_4_ARGS
4813 IEMIMPL_SSE_PROLOGUE
4814 SSE_LD_FXSTATE_MXCSR A0
4815
4816 movdqu xmm0, [A2]
4817 movdqu xmm1, [A3]
4818 %1 xmm0, xmm1
4819 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4820
4821 SSE_ST_FXSTATE_MXCSR A1, A0
4822 IEMIMPL_SSE_EPILOGUE
4823 EPILOGUE_4_ARGS
4824ENDPROC iemAImpl_ %+ %1 %+ _u128
4825
4826 %if %2 == 3
4827BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4828 PROLOGUE_4_ARGS
4829 IEMIMPL_AVX_PROLOGUE
4830 AVX_LD_XSAVEAREA_MXCSR A0
4831
4832 vmovdqu xmm0, [A2]
4833 vmovdqu xmm1, [A3]
4834 v %+ %1 xmm0, xmm0, xmm1
4835 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4836
4837 AVX128_ST_XSAVEAREA_MXCSR A1
4838 IEMIMPL_AVX_EPILOGUE
4839 EPILOGUE_4_ARGS
4840ENDPROC iemAImpl_v %+ %1 %+ _u128
4841
4842BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4843 PROLOGUE_4_ARGS
4844 IEMIMPL_AVX_PROLOGUE
4845 AVX_LD_XSAVEAREA_MXCSR A0
4846
4847 vmovdqu ymm0, [A2]
4848 vmovdqu ymm1, [A3]
4849 v %+ %1 ymm0, ymm0, ymm1
4850 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4851
4852 AVX256_ST_XSAVEAREA_MXCSR A1
4853 IEMIMPL_AVX_EPILOGUE
4854 EPILOGUE_4_ARGS
4855ENDPROC iemAImpl_v %+ %1 %+ _u256
4856 %elif %2 == 2
4857BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4858 PROLOGUE_4_ARGS
4859 IEMIMPL_AVX_PROLOGUE
4860 AVX_LD_XSAVEAREA_MXCSR A0
4861
4862 vmovdqu xmm0, [A2]
4863 vmovdqu xmm1, [A3]
4864 v %+ %1 xmm0, xmm1
4865 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4866
4867 AVX128_ST_XSAVEAREA_MXCSR A1
4868 IEMIMPL_AVX_EPILOGUE
4869 EPILOGUE_4_ARGS
4870ENDPROC iemAImpl_v %+ %1 %+ _u128
4871
4872BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4873 PROLOGUE_4_ARGS
4874 IEMIMPL_AVX_PROLOGUE
4875 AVX_LD_XSAVEAREA_MXCSR A0
4876
4877 vmovdqu ymm0, [A2]
4878 vmovdqu ymm1, [A3]
4879 v %+ %1 ymm0, ymm1
4880 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4881
4882 AVX256_ST_XSAVEAREA_MXCSR A1
4883 IEMIMPL_AVX_EPILOGUE
4884 EPILOGUE_4_ARGS
4885ENDPROC iemAImpl_v %+ %1 %+ _u256
4886 %endif
4887%endmacro
4888
4889IEMIMPL_FP_F2 addps, 3
4890IEMIMPL_FP_F2 addpd, 3
4891IEMIMPL_FP_F2 mulps, 3
4892IEMIMPL_FP_F2 mulpd, 3
4893IEMIMPL_FP_F2 subps, 3
4894IEMIMPL_FP_F2 subpd, 3
4895IEMIMPL_FP_F2 minps, 3
4896IEMIMPL_FP_F2 minpd, 3
4897IEMIMPL_FP_F2 divps, 3
4898IEMIMPL_FP_F2 divpd, 3
4899IEMIMPL_FP_F2 maxps, 3
4900IEMIMPL_FP_F2 maxpd, 3
4901IEMIMPL_FP_F2 haddps, 3
4902IEMIMPL_FP_F2 haddpd, 3
4903IEMIMPL_FP_F2 hsubps, 3
4904IEMIMPL_FP_F2 hsubpd, 3
4905IEMIMPL_FP_F2 addsubps, 3
4906IEMIMPL_FP_F2 addsubpd, 3
4907
4908
4909;;
4910; These are actually unary operations, but to keep things simple
4911; we treat them as binary for now, so the output always stays
4912; in sync with the register the result would get written
4913; to.
4914IEMIMPL_FP_F2 sqrtps, 2
4915IEMIMPL_FP_F2 rsqrtps, 2
4916IEMIMPL_FP_F2 sqrtpd, 2
4917IEMIMPL_FP_F2 rcpps, 2
4918IEMIMPL_FP_F2 cvtdq2ps, 2
4919IEMIMPL_FP_F2 cvtps2dq, 2
4920IEMIMPL_FP_F2 cvttps2dq, 2
4921IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants still missing due to register size differences
4922IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants still missing due to register size differences
4923IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants still missing due to register size differences
4924
4925
4926;;
4927; Floating point instruction working on a full sized register and a single precision operand.
4928;
4929; @param 1 The instruction
4930;
4931; @param A0 FPU context (FXSTATE or XSAVEAREA).
4932; @param A1 Where to return the result including the MXCSR value.
4933; @param A2 Pointer to the first media register size operand (input).
4934; @param A3 Pointer to the second single precision floating point value (input).
4935;
4936%macro IEMIMPL_FP_F2_R32 1
4937BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
4938 PROLOGUE_4_ARGS
4939 IEMIMPL_SSE_PROLOGUE
4940 SSE_LD_FXSTATE_MXCSR A0
4941
4942 movdqu xmm0, [A2]
4943 movd xmm1, [A3]
4944 %1 xmm0, xmm1
4945 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4946
4947 SSE_ST_FXSTATE_MXCSR A1, A0
4948 IEMIMPL_SSE_EPILOGUE
4949 EPILOGUE_4_ARGS
4950ENDPROC iemAImpl_ %+ %1 %+ _u128_r32
4951
4952BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
4953 PROLOGUE_4_ARGS
4954 IEMIMPL_AVX_PROLOGUE
4955 AVX_LD_XSAVEAREA_MXCSR A0
4956
4957 vmovdqu xmm0, [A2]
4958 vmovd xmm1, [A3]
4959 v %+ %1 xmm0, xmm0, xmm1
4960 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4961
4962 AVX128_ST_XSAVEAREA_MXCSR A1
4963 IEMIMPL_AVX_EPILOGUE
4964 EPILOGUE_4_ARGS
4965ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
4966%endmacro
4967
4968IEMIMPL_FP_F2_R32 addss
4969IEMIMPL_FP_F2_R32 mulss
4970IEMIMPL_FP_F2_R32 subss
4971IEMIMPL_FP_F2_R32 minss
4972IEMIMPL_FP_F2_R32 divss
4973IEMIMPL_FP_F2_R32 maxss
4974IEMIMPL_FP_F2_R32 cvtss2sd
4975IEMIMPL_FP_F2_R32 sqrtss
4976IEMIMPL_FP_F2_R32 rsqrtss
4977IEMIMPL_FP_F2_R32 rcpss
4978
4979
4980;;
4981; Floating point instruction working on a full sized register and a double precision operand.
4982;
4983; @param 1 The instruction
4984;
4985; @param A0 FPU context (FXSTATE or XSAVEAREA).
4986; @param A1 Where to return the result including the MXCSR value.
4987; @param A2 Pointer to the first media register size operand (input).
4988; @param A3 Pointer to the second double precision floating point value (input).
4989;
4990%macro IEMIMPL_FP_F2_R64 1
4991BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
4992 PROLOGUE_4_ARGS
4993 IEMIMPL_SSE_PROLOGUE
4994 SSE_LD_FXSTATE_MXCSR A0
4995
4996 movdqu xmm0, [A2]
4997 movq xmm1, [A3]
4998 %1 xmm0, xmm1
4999 movdqu [A1 + IEMSSERESULT.uResult], xmm0
5000
5001 SSE_ST_FXSTATE_MXCSR A1, A0
5002 IEMIMPL_SSE_EPILOGUE
5003 EPILOGUE_4_ARGS
5004ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
5005
5006BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
5007 PROLOGUE_4_ARGS
5008 IEMIMPL_AVX_PROLOGUE
5009 AVX_LD_XSAVEAREA_MXCSR A0
5010
5011 vmovdqu xmm0, [A2]
5012 vmovq xmm1, [A3]
5013 v %+ %1 xmm0, xmm0, xmm1
5014 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
5015
5016 AVX128_ST_XSAVEAREA_MXCSR A1
5017 IEMIMPL_AVX_EPILOGUE
5018 EPILOGUE_4_ARGS
5019ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
5020%endmacro
5021
5022IEMIMPL_FP_F2_R64 addsd
5023IEMIMPL_FP_F2_R64 mulsd
5024IEMIMPL_FP_F2_R64 subsd
5025IEMIMPL_FP_F2_R64 minsd
5026IEMIMPL_FP_F2_R64 divsd
5027IEMIMPL_FP_F2_R64 maxsd
5028IEMIMPL_FP_F2_R64 cvtsd2ss
5029IEMIMPL_FP_F2_R64 sqrtsd
5030
5031
5032;;
5033; Macro for the cvtpd2ps/cvtps2pd instructions.
5034;
5035; @param 1 The instruction name.
5036; @param 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
5037;
5038; @param A0 FPU context (FXSTATE or XSAVEAREA).
5039; @param A1 Where to return the result including the MXCSR value.
5040; @param A2 Pointer to the first media register size operand (input).
5041; @param A3 Pointer to the second media register size operand (input).
5042;
5043%macro IEMIMPL_CVT_F2 2
5044BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5045 PROLOGUE_4_ARGS
5046 IEMIMPL_SSE_PROLOGUE
5047 SSE_LD_FXSTATE_MXCSR A0
5048
5049 movdqu xmm0, [A2]
5050 movdqu xmm1, [A3]
5051 %1 xmm0, xmm1
5052 movdqu [A1 + IEMSSERESULT.uResult], xmm0
5053
5054 SSE_ST_FXSTATE_MXCSR A1, A0
5055 IEMIMPL_SSE_EPILOGUE
5056 EPILOGUE_4_ARGS
5057ENDPROC iemAImpl_ %+ %1 %+ _u128
5058
5059BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
5060 PROLOGUE_4_ARGS
5061 IEMIMPL_AVX_PROLOGUE
5062 AVX_LD_XSAVEAREA_MXCSR A0
5063
5064 vmovdqu xmm0, [A2]
5065 vmovdqu xmm1, [A3]
5066 v %+ %1 xmm0, xmm1
5067 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
5068
5069 AVX128_ST_XSAVEAREA_MXCSR A1
5070 IEMIMPL_AVX_EPILOGUE
5071 EPILOGUE_4_ARGS
5072ENDPROC iemAImpl_v %+ %1 %+ _u128
5073
5074BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
5075 PROLOGUE_4_ARGS
5076 IEMIMPL_AVX_PROLOGUE
5077 AVX_LD_XSAVEAREA_MXCSR A0
5078
5079 vmovdqu ymm0, [A2]
5080 vmovdqu ymm1, [A3]
5081 %if %2 == 0
5082 v %+ %1 xmm0, ymm1
5083 %else
5084 v %+ %1 ymm0, xmm1
5085 %endif
5086 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
5087
5088 AVX256_ST_XSAVEAREA_MXCSR A1
5089 IEMIMPL_AVX_EPILOGUE
5090 EPILOGUE_4_ARGS
5091ENDPROC iemAImpl_v %+ %1 %+ _u256
5092%endmacro
5093
5094IEMIMPL_CVT_F2 cvtpd2ps, 0
5095IEMIMPL_CVT_F2 cvtps2pd, 1
5096
5097
5098;;
5099; shufps instructions with 8-bit immediates.
5100;
5101; @param A0 Pointer to the destination media register size operand (input/output).
5102; @param A1 Pointer to the first source media register size operand (input).
5103; @param A2 The 8-bit immediate
5104;
5105BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
5106 PROLOGUE_3_ARGS
5107 IEMIMPL_SSE_PROLOGUE
5108
5109 movzx A2, A2_8 ; must clear top bits
5110 movdqu xmm0, [A0]
5111 movdqu xmm1, [A1]
5112 lea T1, [.imm0 xWrtRIP]
5113 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5114 lea T0, [A2 + A2*4] ; sizeof(shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
5115 %else
5116 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
5117 %endif
5118 lea T1, [T1 + T0*2]
5119 IBT_NOTRACK
5120 call T1
5121 movdqu [A0], xmm0
5122
5123 IEMIMPL_SSE_EPILOGUE
5124 EPILOGUE_3_ARGS
5125 %assign bImm 0
5126 %rep 256
5127.imm %+ bImm:
5128 IBT_ENDBRxx_WITHOUT_NOTRACK
5129 shufps xmm0, xmm1, bImm
5130 ret
5131 int3
5132 %assign bImm bImm + 1
5133 %endrep
5134.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5135ENDPROC iemAImpl_shufps_u128
5136
5137
5138;;
5139; shufpd instruction with 8-bit immediates.
5140;
5141; @param A0 Pointer to the destination media register size operand (input/output).
5142; @param A1 Pointer to the first source media register size operand (input).
5143; @param A2 The 8-bit immediate
5144;
5145BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
5146 PROLOGUE_3_ARGS
5147 IEMIMPL_SSE_PROLOGUE
5148
5149 movzx A2, A2_8 ; must clear top bits
5150 movdqu xmm0, [A0]
5151 movdqu xmm1, [A1]
5152 lea T1, [.imm0 xWrtRIP]
5153 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5154 lea T0, [A2 + A2*4] ; sizeof(shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
5155 %else
5156 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
5157 %endif
5158 lea T1, [T1 + T0*2]
5159 IBT_NOTRACK
5160 call T1
5161 movdqu [A0], xmm0
5162
5163 IEMIMPL_SSE_EPILOGUE
5164 EPILOGUE_3_ARGS
5165 %assign bImm 0
5166 %rep 256
5167.imm %+ bImm:
5168 IBT_ENDBRxx_WITHOUT_NOTRACK
5169 shufpd xmm0, xmm1, bImm
5170 ret
5171 %assign bImm bImm + 1
5172 %endrep
5173.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5174ENDPROC iemAImpl_shufpd_u128
5175
5176
5177;;
5178; vshufp{s,d} instructions with 8-bit immediates.
5179;
5180; @param 1 The instruction name.
5181;
5182; @param A0 Pointer to the destination media register size operand (output).
5183; @param A1 Pointer to the first source media register size operand (input).
5184; @param A2 Pointer to the second source media register size operand (input).
5185; @param A3 The 8-bit immediate
5186;
5187%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
5188BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5189 PROLOGUE_4_ARGS
5190 IEMIMPL_AVX_PROLOGUE
5191
5192 movzx A3, A3_8 ; must clear top bits
5193 movdqu xmm0, [A1]
5194 movdqu xmm1, [A2]
5195 lea T1, [.imm0 xWrtRIP]
5196 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5197 lea T0, [A3 + A3*4] ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5198 %else
5199 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5200 %endif
5201 lea T1, [T1 + T0*2]
5202 IBT_NOTRACK
5203 call T1
5204 movdqu [A0], xmm0
5205
5206 IEMIMPL_AVX_EPILOGUE
5207 EPILOGUE_4_ARGS
5208 %assign bImm 0
5209 %rep 256
5210.imm %+ bImm:
5211 IBT_ENDBRxx_WITHOUT_NOTRACK
5212 %1 xmm0, xmm0, xmm1, bImm
5213 ret
5214 %assign bImm bImm + 1
5215 %endrep
5216.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5217ENDPROC iemAImpl_ %+ %1 %+ _u128
5218
5219BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5220 PROLOGUE_4_ARGS
5221 IEMIMPL_AVX_PROLOGUE
5222
5223 movzx A3, A3_8 ; must clear top bits
5224 vmovdqu ymm0, [A1]
5225 vmovdqu ymm1, [A2]
5226 lea T1, [.imm0 xWrtRIP]
5227 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5228 lea T0, [A3 + A3*4] ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5229 %else
5230 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5231 %endif
5232 lea T1, [T1 + T0*2]
5233 IBT_NOTRACK
5234 call T1
5235 vmovdqu [A0], ymm0
5236
5237 IEMIMPL_AVX_EPILOGUE
5238 EPILOGUE_4_ARGS
5239 %assign bImm 0
5240 %rep 256
5241.imm %+ bImm:
5242 IBT_ENDBRxx_WITHOUT_NOTRACK
5243 %1 ymm0, ymm0, ymm1, bImm
5244 ret
5245 %assign bImm bImm + 1
5246 %endrep
5247.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5248ENDPROC iemAImpl_ %+ %1 %+ _u256
5249%endmacro
5250
5251IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
5252IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5253
5254
5255;;
5256; One of the [p]blendv{b,ps,pd} variants
5257;
5258; @param 1 The instruction
5259;
5260; @param A0 Pointer to the first media register sized operand (input/output).
5261; @param A1 Pointer to the second media register sized value (input).
5262; @param A2 Pointer to the media register sized mask value (input).
5263;
5264%macro IEMIMPL_P_BLEND 1
5265BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5266 PROLOGUE_3_ARGS
5267 IEMIMPL_SSE_PROLOGUE
5268
5269 movdqu xmm0, [A2] ; This is implicit
5270 movdqu xmm1, [A0]
5271 movdqu xmm2, [A1] ; @todo Do I need to save the original value here first?
5272 %1 xmm1, xmm2
5273 movdqu [A0], xmm1
5274
5275 IEMIMPL_SSE_EPILOGUE
5276 EPILOGUE_3_ARGS
5277ENDPROC iemAImpl_ %+ %1 %+ _u128
5278%endmacro
5279
5280IEMIMPL_P_BLEND pblendvb
5281IEMIMPL_P_BLEND blendvps
5282IEMIMPL_P_BLEND blendvpd
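;
; Note for the template above: the SSE4.1 [p]blendv forms take XMM0 as an implicit mask
; operand (hence the 'This is implicit' load above); each destination element is replaced by
; the corresponding source element when the most significant bit of the matching mask element
; (byte, dword or qword for pblendvb, blendvps and blendvpd respectively) is set.
;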
5283
5284
5285;;
5286; One of the v[p]blendv{b,ps,pd} variants
5287;
5288; @param 1 The instruction
5289;
5290; @param A0 Pointer to the destination media register sized operand (output).
5291; @param A1 Pointer to the first source media register sized operand (input).
5292; @param A2 Pointer to the second media register sized operand (input).
5293; @param A3 Pointer to the media register sized mask value (input).
5294%macro IEMIMPL_AVX_P_BLEND 1
5295BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5296 PROLOGUE_4_ARGS
5297 IEMIMPL_AVX_PROLOGUE
5298
5299 vmovdqu xmm0, [A1]
5300 vmovdqu xmm1, [A2]
5301 vmovdqu xmm2, [A3]
5302 %1 xmm0, xmm0, xmm1, xmm2
5303 vmovdqu [A0], xmm0
5304
5305 IEMIMPL_AVX_EPILOGUE
5306 EPILOGUE_4_ARGS
5307ENDPROC iemAImpl_ %+ %1 %+ _u128
5308
5309BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5310 PROLOGUE_4_ARGS
5311 IEMIMPL_AVX_PROLOGUE
5312
5313 vmovdqu ymm0, [A1]
5314 vmovdqu ymm1, [A2]
5315 vmovdqu ymm2, [A3]
5316 %1 ymm0, ymm0, ymm1, ymm2
5317 vmovdqu [A0], ymm0
5318
5319 IEMIMPL_AVX_EPILOGUE
5320 EPILOGUE_4_ARGS
5321ENDPROC iemAImpl_ %+ %1 %+ _u256
5322%endmacro
5323
5324IEMIMPL_AVX_P_BLEND vpblendvb
5325IEMIMPL_AVX_P_BLEND vblendvps
5326IEMIMPL_AVX_P_BLEND vblendvpd
5327
5328
5329;;
5330; palignr mm1, mm2/m64 instruction.
5331;
5332; @param A0 Pointer to the first media register sized operand (output).
5333; @param A1 The second register sized operand (input).
5334; @param A2 The 8-bit immediate.
5335BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
5336 PROLOGUE_3_ARGS
5337 IEMIMPL_MMX_PROLOGUE
5338
5339 movzx A2, A2_8 ; must clear top bits
5340 movq mm0, [A0]
5341 movq mm1, A1
5342 lea T1, [.imm0 xWrtRIP]
5343 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5344 lea T0, [A2 + A2*4] ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
5345 %else
5346 lea T0, [A2 + A2*2] ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
5347 %endif
5348 lea T1, [T1 + T0*2]
5349 IBT_NOTRACK
5350 call T1
5351 movq [A0], mm0
5352
5353 IEMIMPL_MMX_EPILOGUE
5354 EPILOGUE_3_ARGS
5355 %assign bImm 0
5356 %rep 256
5357.imm %+ bImm:
5358 IBT_ENDBRxx_WITHOUT_NOTRACK
5359 palignr mm0, mm1, bImm
5360 ret
5361 %assign bImm bImm + 1
5362 %endrep
5363.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5364ENDPROC iemAImpl_palignr_u64
5365
5366
5367;;
5368; SSE instructions with 8-bit immediates of the form
5369; xxx xmm1, xmm2, imm8.
5370; where the instruction encoding takes up 6 bytes.
5371;
5372; @param 1 The instruction name.
5373;
5374; @param A0 Pointer to the first media register size operand (input/output).
5375; @param A1 Pointer to the second source media register size operand (input).
5376; @param A2 The 8-bit immediate
5377;
5378%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
5379BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5380 PROLOGUE_3_ARGS
5381 IEMIMPL_SSE_PROLOGUE
5382
5383 movzx A2, A2_8 ; must clear top bits
5384 movdqu xmm0, [A0]
5385 movdqu xmm1, [A1]
5386 lea T1, [.imm0 xWrtRIP]
5387 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5388 lea T0, [A2 + A2*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
5389 lea T1, [T1 + T0*4]
5390 %else
5391 lea T1, [T1 + A2*8] ; sizeof(insnX+ret+int3) == 8: A2 * 8
5392 %endif
5393 IBT_NOTRACK
5394 call T1
5395 movdqu [A0], xmm0
5396
5397 IEMIMPL_SSE_EPILOGUE
5398 EPILOGUE_3_ARGS
5399 %assign bImm 0
5400 %rep 256
5401.imm %+ bImm:
5402 IBT_ENDBRxx_WITHOUT_NOTRACK
5403 %1 xmm0, xmm1, bImm
5404 ret
5405 int3
5406 %assign bImm bImm + 1
5407 %endrep
5408.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5409ENDPROC iemAImpl_ %+ %1 %+ _u128
5410%endmacro
5411
5412IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
5413IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
5414IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
5415IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
5416IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
5417IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5418IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5419
5420
5421;;
5422; AVX instructions with 8-bit immediates of the form
5423; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5424; where the instruction encoding takes up 6 bytes.
5425;
5426; @param 1 The instruction name.
5427; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5428; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5429;
5430; @param A0 Pointer to the destination media register size operand (output).
5431; @param A1 Pointer to the first source media register size operand (input).
5432; @param A2 Pointer to the second source media register size operand (input).
5433; @param A3 The 8-bit immediate
5434;
5435%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
5436 %if %2 == 1
5437BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5438 PROLOGUE_4_ARGS
5439 IEMIMPL_AVX_PROLOGUE
5440
5441 movzx A3, A3_8 ; must clear top bits
5442 movdqu xmm0, [A1]
5443 movdqu xmm1, [A2]
5444 lea T1, [.imm0 xWrtRIP]
5445 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5446 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5447 lea T1, [T1 + T0*4]
5448 %else
5449 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5450 %endif
5451 IBT_NOTRACK
5452 call T1
5453 movdqu [A0], xmm0
5454
5455 IEMIMPL_AVX_EPILOGUE
5456 EPILOGUE_4_ARGS
5457 %assign bImm 0
5458 %rep 256
5459.imm %+ bImm:
5460 IBT_ENDBRxx_WITHOUT_NOTRACK
5461 %1 xmm0, xmm0, xmm1, bImm
5462 ret
5463 int3
5464 %assign bImm bImm + 1
5465 %endrep
5466.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5467ENDPROC iemAImpl_ %+ %1 %+ _u128
5468 %endif
5469
5470 %if %3 == 1
5471BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5472 PROLOGUE_4_ARGS
5473 IEMIMPL_AVX_PROLOGUE
5474
5475 movzx A3, A3_8 ; must clear top bits
5476 vmovdqu ymm0, [A1]
5477 vmovdqu ymm1, [A2]
5478 lea T1, [.imm0 xWrtRIP]
5479 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5480 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5481 lea T1, [T1 + T0*4]
5482 %else
5483 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5484 %endif
5485 IBT_NOTRACK
5486 call T1
5487 vmovdqu [A0], ymm0
5488
5489 IEMIMPL_AVX_EPILOGUE
5490 EPILOGUE_4_ARGS
5491 %assign bImm 0
5492 %rep 256
5493.imm %+ bImm:
5494 IBT_ENDBRxx_WITHOUT_NOTRACK
5495 %1 ymm0, ymm0, ymm1, bImm
5496 ret
5497 int3
5498 %assign bImm bImm + 1
5499 %endrep
5500.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5501ENDPROC iemAImpl_ %+ %1 %+ _u256
5502 %endif
5503%endmacro
5504
5505IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
5506IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
5507IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
5508IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd, 1, 1
5509IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
5510IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
5511IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
5512IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
5513IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
5514
5515
5516;;
5517; AVX instructions with 8-bit immediates of the form
5518; xxx {x,y}mm1, {x,y}mm2, imm8.
5519; where the instruction encoding takes up 6 bytes.
5520;
5521; @param 1 The instruction name.
5522; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5523; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5524;
5525; @param A0 Pointer to the destination media register size operand (output).
5526; @param A1 Pointer to the first source media register size operand (input).
5527; @param A2 The 8-bit immediate
5528;
5529%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 3
5530 %if %2 == 1
5531BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
5532 PROLOGUE_4_ARGS
5533 IEMIMPL_AVX_PROLOGUE
5534
5535 movzx A2, A2_8 ; must clear top bits
5536 movdqu xmm1, [A1]
5537 lea T1, [.imm0 xWrtRIP]
5538 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5539 lea T0, [A2 + A2*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
5540 lea T1, [T1 + T0*4]
5541 %else
5542 lea T1, [T1 + A2*8] ; sizeof(insnX+ret+int3) == 8: A2 * 8
5543 %endif
5544 IBT_NOTRACK
5545 call T1
5546 movdqu [A0], xmm0
5547
5548 IEMIMPL_AVX_EPILOGUE
5549 EPILOGUE_4_ARGS
5550 %assign bImm 0
5551 %rep 256
5552.imm %+ bImm:
5553 IBT_ENDBRxx_WITHOUT_NOTRACK
5554 %1 xmm0, xmm1, bImm
5555 ret
5556 int3
5557 %assign bImm bImm + 1
5558 %endrep
5559.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5560ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
5561 %endif
5562
5563 %if %3 == 1
5564BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
5565 PROLOGUE_4_ARGS
5566 IEMIMPL_AVX_PROLOGUE
5567
5568 movzx A2, A2_8 ; must clear top bits
5569 vmovdqu ymm1, [A1]
5570 lea T1, [.imm0 xWrtRIP]
5571 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5572 lea T0, [A2 + A2*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
5573 lea T1, [T1 + T0*4]
5574 %else
5575 lea T1, [T1 + A2*8] ; sizeof(insnX+ret+int3) == 8: A2 * 8
5576 %endif
5577 IBT_NOTRACK
5578 call T1
5579 vmovdqu [A0], ymm0
5580
5581 IEMIMPL_AVX_EPILOGUE
5582 EPILOGUE_4_ARGS
5583 %assign bImm 0
5584 %rep 256
5585.imm %+ bImm:
5586 IBT_ENDBRxx_WITHOUT_NOTRACK
5587 %1 ymm0, ymm1, bImm
5588 ret
5589 int3
5590 %assign bImm bImm + 1
5591 %endrep
5592.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5593ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
5594 %endif
5595%endmacro
5596
5597IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 vpermilps, 1, 1
5598IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 vpermilpd, 1, 1
5599
5600
5601;;
5602; Need to move this as well somewhere better?
5603;
5604struc IEMPCMPISTRXSRC
5605 .uSrc1 resd 4
5606 .uSrc2 resd 4
5607endstruc
5608
5609struc IEMPCMPESTRXSRC
5610 .uSrc1 resd 4
5611 .uSrc2 resd 4
5612 .u64Rax resd 2
5613 .u64Rdx resd 2
5614endstruc
5615
5616;;
5617; The pcmpistri instruction.
5618;
5619; @param A0 Pointer to the ECX register to store the result to (output).
5620; @param A1 Pointer to the EFLAGS register.
5621; @param A2 Pointer to the structure containing the source operands (input).
5622; @param A3 The 8-bit immediate
5623;
5624BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
5625 PROLOGUE_4_ARGS
5626 IEMIMPL_SSE_PROLOGUE
5627
5628 movzx A3, A3_8 ; must clear top bits
5629 movdqu xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
5630 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
5631 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5632 lea T1, [.imm0 xWrtRIP]
5633 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5634 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5635 lea T1, [T1 + T0*4]
5636 %else
5637 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5638 %endif
5639 IBT_NOTRACK
5640 call T1
5641
5642 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5643 mov [T2], ecx
5644
5645 IEMIMPL_SSE_EPILOGUE
5646 EPILOGUE_4_ARGS
5647 %assign bImm 0
5648 %rep 256
5649.imm %+ bImm:
5650 IBT_ENDBRxx_WITHOUT_NOTRACK
5651 pcmpistri xmm0, xmm1, bImm
5652 ret
5653 int3
5654 %assign bImm bImm + 1
5655 %endrep
5656.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5657ENDPROC iemAImpl_pcmpistri_u128
5658
5659;;
5660; The pcmpestri instruction.
5661;
5662; @param A0 Pointer to the ECX register to store the result to (output).
5663; @param A1 Pointer to the EFLAGS register.
5664; @param A2 Pointer to the structure containing the source operands (input).
5665; @param A3 The 8-bit immediate
5666;
5667BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
5668 PROLOGUE_4_ARGS
5669 IEMIMPL_SSE_PROLOGUE
5670
5671 movzx A3, A3_8 ; must clear top bits
5672 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
5673 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
5674 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5675 lea T1, [.imm0 xWrtRIP]
5676 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5677 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5678 lea T1, [T1 + T0*4]
5679 %else
5680 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5681 %endif
5682 push xDX ; xDX can be A1 or A2 depending on the calling convention
5683 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5684 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5685 IBT_NOTRACK
5686 call T1
5687
5688 pop xDX
5689 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5690 mov [T2], ecx
5691
5692 IEMIMPL_SSE_EPILOGUE
5693 EPILOGUE_4_ARGS
5694 %assign bImm 0
5695 %rep 256
5696.imm %+ bImm:
5697 IBT_ENDBRxx_WITHOUT_NOTRACK
5698 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5699 pcmpestri xmm0, xmm1, bImm
5700 ret
5701 %assign bImm bImm + 1
5702 %endrep
5703.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5704ENDPROC iemAImpl_pcmpestri_u128
5705
5706;;
5707; The pcmpistrm instruction template.
5708;
5709; @param A0 Pointer to the XMM0 register to store the result to (output).
5710; @param A1 Pointer to the EFLAGS register.
5711; @param A2 Pointer to the structure containing the source operands (input).
5712; @param A3 The 8-bit immediate
5713;
5714BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
5715 PROLOGUE_4_ARGS
5716 IEMIMPL_SSE_PROLOGUE
5717
5718 movzx A3, A3_8 ; must clear top bits
5719 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
5720 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
5721 lea T1, [.imm0 xWrtRIP]
5722 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5723 lea T0, [A3 + A3*2] ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
5724 lea T1, [T1 + T0*4]
5725 %else
5726 lea T1, [T1 + A3*8] ; sizeof(pcmpistrm+ret) == 8: A3 * 8
5727 %endif
5728 IBT_NOTRACK
5729 call T1
5730
5731 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5732 movdqu [A0], xmm0
5733
5734 IEMIMPL_SSE_EPILOGUE
5735 EPILOGUE_4_ARGS
5736 %assign bImm 0
5737 %rep 256
5738.imm %+ bImm:
5739 IBT_ENDBRxx_WITHOUT_NOTRACK
5740 pcmpistrm xmm1, xmm2, bImm
5741 ret
5742 int3
5743 %assign bImm bImm + 1
5744 %endrep
5745.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5746ENDPROC iemAImpl_pcmpistrm_u128
5747
5748;;
5749; The pcmpestrm instruction template.
5750;
5751; @param A0 Pointer to the XMM0 register to store the result to (output).
5752; @param A1 Pointer to the EFLAGS register.
5753; @param A2 Pointer to the structure containing the source operands (input).
5754; @param A3 The 8-bit immediate
5755;
5756BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
5757 PROLOGUE_4_ARGS
5758 IEMIMPL_SSE_PROLOGUE
5759
5760 movzx A3, A3_8 ; must clear top bits
5761 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
5762 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
5763 lea T1, [.imm0 xWrtRIP]
5764 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5765 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5766 lea T1, [T1 + T0*4]
5767 %else
5768 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5769 %endif
5770 push xDX ; xDX can be A1 or A2 depending on the calling convention
5771 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5772 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5773 IBT_NOTRACK
5774 call T1
5775
5776 pop xDX
5777 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5778 movdqu [A0], xmm0
5779
5780 IEMIMPL_SSE_EPILOGUE
5781 EPILOGUE_4_ARGS
5782 %assign bImm 0
5783 %rep 256
5784.imm %+ bImm:
5785 IBT_ENDBRxx_WITHOUT_NOTRACK
5786 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5787 pcmpestrm xmm1, xmm2, bImm
5788 ret
5789 %assign bImm bImm + 1
5790 %endrep
5791.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5792ENDPROC iemAImpl_pcmpestrm_u128
5793
5794
5795;;
5796; pinsrw instruction.
5797;
5798; @param A0 Pointer to the first media register size operand (input/output).
5799; @param A1 The 16 bit input operand (input).
5800; @param A2 The 8-bit immediate
5801;
5802BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
5803 PROLOGUE_3_ARGS
5804 IEMIMPL_SSE_PROLOGUE
5805
5806 movzx A2, A2_8 ; must clear top bits
5807 movq mm0, [A0]
5808 lea T1, [.imm0 xWrtRIP]
5809 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5810 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
5811 %else
5812 lea T0, [A2 + A2*4] ; sizeof(pinsrw+ret) == 5: A2 * 5
5813 %endif
5814 lea T1, [T1 + T0]
5815 IBT_NOTRACK
5816 call T1
5817 movq [A0], mm0
5818
5819 IEMIMPL_SSE_EPILOGUE
5820 EPILOGUE_3_ARGS
5821 %assign bImm 0
5822 %rep 256
5823.imm %+ bImm:
5824 IBT_ENDBRxx_WITHOUT_NOTRACK
5825 pinsrw mm0, A1_32, bImm
5826 ret
5827 %assign bImm bImm + 1
5828 %endrep
5829.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5830ENDPROC iemAImpl_pinsrw_u64
5831
5832BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
5833 PROLOGUE_3_ARGS
5834 IEMIMPL_SSE_PROLOGUE
5835
5836 movzx A2, A2_8 ; must clear top bits
5837 movdqu xmm0, [A0]
5838 lea T1, [.imm0 xWrtRIP]
5839 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5840 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5841 %else
5842 lea T0, [A2 + A2*2] ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5843 %endif
5844 lea T1, [T1 + T0*2]
5845 IBT_NOTRACK
5846 call T1
5847 movdqu [A0], xmm0
5848
5849 IEMIMPL_SSE_EPILOGUE
5850 EPILOGUE_3_ARGS
5851 %assign bImm 0
5852 %rep 256
5853.imm %+ bImm:
5854 IBT_ENDBRxx_WITHOUT_NOTRACK
5855 pinsrw xmm0, A1_32, bImm
5856 ret
5857 %assign bImm bImm + 1
5858 %endrep
5859.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5860ENDPROC iemAImpl_pinsrw_u128
5861
5862;;
5863; vpinsrw instruction.
5864;
5865; @param A0 Pointer to the first media register size operand (output).
5866; @param A1 Pointer to the source media register size operand (input).
5867; @param A2 The 16 bit input operand (input).
5868; @param A3 The 8-bit immediate
5869;
5870BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
5871 PROLOGUE_4_ARGS
5872 IEMIMPL_SSE_PROLOGUE
5873
5874 movzx A3, A3_8 ; must clear top bits
5875 movdqu xmm0, [A1]
5876 lea T1, [.imm0 xWrtRIP]
5877 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5878 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
5879 %else
5880 lea T0, [A3 + A3*2] ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
5881 %endif
5882 lea T1, [T1 + T0*2]
5883 mov A1, A2 ; A2 requires longer encoding on Windows
5884 IBT_NOTRACK
5885 call T1
5886 movdqu [A0], xmm0
5887
5888 IEMIMPL_SSE_EPILOGUE
5889 EPILOGUE_4_ARGS
5890 %assign bImm 0
5891 %rep 256
5892.imm %+ bImm:
5893 IBT_ENDBRxx_WITHOUT_NOTRACK
5894 vpinsrw xmm0, xmm0, A1_32, bImm
5895 ret
5896 %assign bImm bImm + 1
5897 %endrep
5898.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5899ENDPROC iemAImpl_vpinsrw_u128
5900
5901
5902;;
5903; pextrw instruction.
5904;
5905; @param A0 Pointer to the 16-bit output operand (output).
5906; @param A1 The 64-bit source operand for the u64 variant, or a pointer to the media register sized operand for the u128 variant (input).
5907; @param A2 The 8-bit immediate
5908;
5909BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
5910 PROLOGUE_3_ARGS
5911 IEMIMPL_SSE_PROLOGUE
5912
5913 movzx A2, A2_8 ; must clear top bits
5914 movq mm0, A1
5915 lea T1, [.imm0 xWrtRIP]
5916 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5917 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
5918 %else
5919 lea T0, [A2 + A2*4] ; sizeof(pextrw+ret) == 5: A2 * 5
5920 %endif
5921 lea T1, [T1 + T0]
5922 IBT_NOTRACK
5923 call T1
5924 mov word [A0], T0_16
5925
5926 IEMIMPL_SSE_EPILOGUE
5927 EPILOGUE_3_ARGS
5928 %assign bImm 0
5929 %rep 256
5930.imm %+ bImm:
5931 IBT_ENDBRxx_WITHOUT_NOTRACK
5932 pextrw T0_32, mm0, bImm
5933 ret
5934 %assign bImm bImm + 1
5935 %endrep
5936.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5937ENDPROC iemAImpl_pextrw_u64
5938
5939BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
5940 PROLOGUE_3_ARGS
5941 IEMIMPL_SSE_PROLOGUE
5942
5943 movzx A2, A2_8 ; must clear top bits
5944 movdqu xmm0, [A1]
5945 lea T1, [.imm0 xWrtRIP]
5946 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5947 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5948 %else
5949 lea T0, [A2 + A2*2] ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5950 %endif
5951 lea T1, [T1 + T0*2]
5952 IBT_NOTRACK
5953 call T1
5954 mov word [A0], T0_16
5955
5956 IEMIMPL_SSE_EPILOGUE
5957 EPILOGUE_3_ARGS
5958 %assign bImm 0
5959 %rep 256
5960.imm %+ bImm:
5961 IBT_ENDBRxx_WITHOUT_NOTRACK
5962 pextrw T0_32, xmm0, bImm
5963 ret
5964 %assign bImm bImm + 1
5965 %endrep
5966.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5967ENDPROC iemAImpl_pextrw_u128
5968
5969;;
5970; vpextrw instruction.
5971;
5972; @param A0 Pointer to the 16bit output operand (output).
5973; @param A1 Pointer to the source media register size operand (input).
5974; @param A2 The 8-bit immediate
5975;
5976BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
5977 PROLOGUE_3_ARGS
5978 IEMIMPL_SSE_PROLOGUE
5979
5980 movzx A2, A2_8 ; must clear top bits
5981 movdqu xmm0, [A1]
5982 lea T1, [.imm0 xWrtRIP]
5983 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5984 lea T0, [A2 + A2*4] ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5985 %else
5986 lea T0, [A2 + A2*2] ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5987 %endif
5988 lea T1, [T1 + T0*2]
5989 IBT_NOTRACK
5990 call T1
5991 mov word [A0], T0_16
5992
5993 IEMIMPL_SSE_EPILOGUE
5994 EPILOGUE_3_ARGS
5995 %assign bImm 0
5996 %rep 256
5997.imm %+ bImm:
5998 IBT_ENDBRxx_WITHOUT_NOTRACK
5999 vpextrw T0_32, xmm0, bImm
6000 ret
6001 %assign bImm bImm + 1
6002 %endrep
6003.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6004ENDPROC iemAImpl_vpextrw_u128
6005
6006
6007;;
6008; movmskp{s,d} SSE instruction template
6009;
6010; @param 1 The SSE instruction name.
6011; @param 2 The AVX instruction name.
6012;
6013; @param A0 Pointer to the output register (output/byte sized).
6014; @param A1 Pointer to the source media register size operand (input).
6015;
6016%macro IEMIMPL_MEDIA_MOVMSK_P 2
6017BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6018 PROLOGUE_2_ARGS
6019 IEMIMPL_SSE_PROLOGUE
6020
6021 movdqu xmm0, [A1]
6022 %1 T0, xmm0
6023 mov byte [A0], T0_8
6024
6025 IEMIMPL_SSE_EPILOGUE
6026 EPILOGUE_2_ARGS
6027ENDPROC iemAImpl_ %+ %1 %+ _u128
6028
6029BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
6030 PROLOGUE_2_ARGS
6031 IEMIMPL_AVX_PROLOGUE
6032
6033 movdqu xmm0, [A1]
6034 %2 T0, xmm0
6035 mov byte [A0], T0_8
6036
6037 IEMIMPL_AVX_EPILOGUE
6038 EPILOGUE_2_ARGS
6039ENDPROC iemAImpl_ %+ %2 %+ _u128
6040
6041BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
6042 PROLOGUE_2_ARGS
6043 IEMIMPL_AVX_PROLOGUE
6044
6045 vmovdqu ymm0, [A1]
6046 %2 T0, ymm0
6047 mov byte [A0], T0_8
6048
6049 IEMIMPL_AVX_EPILOGUE
6050 EPILOGUE_2_ARGS
6051ENDPROC iemAImpl_ %+ %2 %+ _u256
6052%endmacro
6053
6054IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
6055IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6056
6057
6058;;
6059; Restores the SSE MXCSR register with the original value.
6060;
6061; @uses 4 bytes of stack to save the current MXCSR value, T0, T1.
6062; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6063; @param 2 Expression giving the address of the FXSTATE of the guest.
6064;
6065; @note Restores the stack pointer.
6066;
6067%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
6068 sub xSP, 4
6069 stmxcsr [xSP]
6070 mov T0_32, [xSP]
6071 add xSP, 4
6072 ; Merge the status bits into the original MXCSR value.
6073 mov T1_32, [%2 + X86FXSTATE.MXCSR]
6074 and T0_32, X86_MXCSR_XCPT_FLAGS
6075 or T0_32, T1_32
6076 mov [%1], T0_32
6077
6078 ldmxcsr [xSP]
6079 add xSP, 4
6080%endmacro
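;
; Worked example for the status merge above (values assumed for illustration):
; if the guest MXCSR in the FXSTATE is the power-on default 0x1f80 (all
; exceptions masked, no flags set) and emulating the instruction raised the
; precision flag (PE, bit 5) on the host, then
;       T0   = host MXCSR & X86_MXCSR_XCPT_FLAGS = 0x20
;       [%1] = 0x1f80 | 0x20 = 0x1fa0
; i.e. only the six exception status flags are merged back into the guest value.
;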
6081
6082
6083;;
6084; cvttsd2si instruction - 32-bit variant.
6085;
6086; @param A0 FPU context (FXSTATE or XSAVEAREA).
6087; @param A1 Where to return the MXCSR value.
6088; @param A2 Pointer to the result operand (output).
6089; @param A3 Pointer to the second operand (input).
6090;
6091BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
6092 PROLOGUE_4_ARGS
6093 IEMIMPL_SSE_PROLOGUE
6094 SSE_LD_FXSTATE_MXCSR A0
6095
6096 cvttsd2si T0_32, [A3]
6097 mov dword [A2], T0_32
6098
6099 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6100 IEMIMPL_SSE_EPILOGUE
6101 EPILOGUE_4_ARGS
6102ENDPROC iemAImpl_cvttsd2si_i32_r64
6103
6104;;
6105; cvttsd2si instruction - 64-bit variant.
6106;
6107; @param A0 FPU context (FXSTATE or XSAVEAREA).
6108; @param A1 Where to return the MXCSR value.
6109; @param A2 Pointer to the result operand (output).
6110; @param A3 Pointer to the second operand (input).
6111;
6112BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
6113 PROLOGUE_4_ARGS
6114 IEMIMPL_SSE_PROLOGUE
6115 SSE_LD_FXSTATE_MXCSR A0
6116
6117 cvttsd2si T0, [A3]
6118 mov qword [A2], T0
6119
6120 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6121 IEMIMPL_SSE_EPILOGUE
6122 EPILOGUE_4_ARGS
6123ENDPROC iemAImpl_cvttsd2si_i64_r64
6124
6125
6126;;
6127; cvtsd2si instruction - 32-bit variant.
6128;
6129; @param A0 FPU context (FXSTATE or XSAVEAREA).
6130; @param A1 Where to return the MXCSR value.
6131; @param A2 Pointer to the result operand (output).
6132; @param A3 Pointer to the second operand (input).
6133;
6134BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
6135 PROLOGUE_4_ARGS
6136 IEMIMPL_SSE_PROLOGUE
6137 SSE_LD_FXSTATE_MXCSR A0
6138
6139 cvtsd2si T0_32, [A3]
6140 mov dword [A2], T0_32
6141
6142 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6143 IEMIMPL_SSE_EPILOGUE
6144 EPILOGUE_4_ARGS
6145ENDPROC iemAImpl_cvtsd2si_i32_r64
6146
6147;;
6148; cvtsd2si instruction - 64-bit variant.
6149;
6150; @param A0 FPU context (FXSTATE or XSAVEAREA).
6151; @param A1 Where to return the MXCSR value.
6152; @param A2 Pointer to the result operand (output).
6153; @param A3 Pointer to the second operand (input).
6154;
6155BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
6156 PROLOGUE_4_ARGS
6157 IEMIMPL_SSE_PROLOGUE
6158 SSE_LD_FXSTATE_MXCSR A0
6159
6160 cvtsd2si T0, [A3]
6161 mov qword [A2], T0
6162
6163 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6164 IEMIMPL_SSE_EPILOGUE
6165 EPILOGUE_4_ARGS
6166ENDPROC iemAImpl_cvtsd2si_i64_r64
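;
; Illustrative note: the cvttsd2si helpers always truncate toward zero, while
; the cvtsd2si ones honour the rounding mode loaded from the guest MXCSR.
; E.g. for a source value of -1.7, cvttsd2si returns -1, whereas cvtsd2si
; under the default round-to-nearest mode returns -2.
;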
6167
6168
6169;;
6170; cvttss2si instruction - 32-bit variant.
6171;
6172; @param A0 FPU context (FXSTATE or XSAVEAREA).
6173; @param A1 Where to return the MXCSR value.
6174; @param A2 Pointer to the result operand (output).
6175; @param A3 Pointer to the second operand (input).
6176;
6177BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
6178 PROLOGUE_4_ARGS
6179 IEMIMPL_SSE_PROLOGUE
6180 SSE_LD_FXSTATE_MXCSR A0
6181
6182 cvttss2si T0_32, [A3]
6183 mov dword [A2], T0_32
6184
6185 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6186 IEMIMPL_SSE_EPILOGUE
6187 EPILOGUE_4_ARGS
6188ENDPROC iemAImpl_cvttss2si_i32_r32
6189
6190;;
6191; cvttss2si instruction - 64-bit variant.
6192;
6193; @param A0 FPU context (FXSTATE or XSAVEAREA).
6194; @param A1 Where to return the MXCSR value.
6195; @param A2 Pointer to the result operand (output).
6196; @param A3 Pointer to the second operand (input).
6197;
6198BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
6199 PROLOGUE_4_ARGS
6200 IEMIMPL_SSE_PROLOGUE
6201 SSE_LD_FXSTATE_MXCSR A0
6202
6203 cvttss2si T0, [A3]
6204 mov qword [A2], T0
6205
6206 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6207 IEMIMPL_SSE_EPILOGUE
6208 EPILOGUE_4_ARGS
6209ENDPROC iemAImpl_cvttss2si_i64_r32
6210
6211
6212;;
6213; cvtss2si instruction - 32-bit variant.
6214;
6215; @param A0 FPU context (FXSTATE or XSAVEAREA).
6216; @param A1 Where to return the MXCSR value.
6217; @param A2 Pointer to the result operand (output).
6218; @param A3 Pointer to the second operand (input).
6219;
6220BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
6221 PROLOGUE_4_ARGS
6222 IEMIMPL_SSE_PROLOGUE
6223 SSE_LD_FXSTATE_MXCSR A0
6224
6225 cvtss2si T0_32, [A3]
6226 mov dword [A2], T0_32
6227
6228 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6229 IEMIMPL_SSE_EPILOGUE
6230 EPILOGUE_4_ARGS
6231ENDPROC iemAImpl_cvtss2si_i32_r32
6232
6233;;
6234; cvtss2si instruction - 64-bit variant.
6235;
6236; @param A0 FPU context (FXSTATE or XSAVEAREA).
6237; @param A1 Where to return the MXCSR value.
6238; @param A2 Pointer to the result operand (output).
6239; @param A3 Pointer to the second operand (input).
6240;
6241BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
6242 PROLOGUE_4_ARGS
6243 IEMIMPL_SSE_PROLOGUE
6244 SSE_LD_FXSTATE_MXCSR A0
6245
6246 cvtss2si T0, [A3]
6247 mov qword [A2], T0
6248
6249 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6250 IEMIMPL_SSE_EPILOGUE
6251 EPILOGUE_4_ARGS
6252ENDPROC iemAImpl_cvtss2si_i64_r32
6253
6254
6255;;
6256; cvtsi2ss instruction - 32-bit variant.
6257;
6258; @param A0 FPU context (FXSTATE or XSAVEAREA).
6259; @param A1 Where to return the MXCSR value.
6260; @param A2 Pointer to the result operand (output).
6261; @param A3 Pointer to the second operand (input).
6262;
6263BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
6264 PROLOGUE_4_ARGS
6265 IEMIMPL_SSE_PROLOGUE
6266 SSE_LD_FXSTATE_MXCSR A0
6267
6268 cvtsi2ss xmm0, dword [A3]
6269 movd dword [A2], xmm0
6270
6271 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6272 IEMIMPL_SSE_EPILOGUE
6273 EPILOGUE_4_ARGS
6274ENDPROC iemAImpl_cvtsi2ss_r32_i32
6275
6276;;
6277; cvtsi2ss instruction - 64-bit variant.
6278;
6279; @param A0 FPU context (FXSTATE or XSAVEAREA).
6280; @param A1 Where to return the MXCSR value.
6281; @param A2 Pointer to the result operand (output).
6282; @param A3 Pointer to the second operand (input).
6283;
6284BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
6285 PROLOGUE_4_ARGS
6286 IEMIMPL_SSE_PROLOGUE
6287 SSE_LD_FXSTATE_MXCSR A0
6288
6289 cvtsi2ss xmm0, qword [A3]
6290 movd dword [A2], xmm0
6291
6292 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6293 IEMIMPL_SSE_EPILOGUE
6294 EPILOGUE_4_ARGS
6295ENDPROC iemAImpl_cvtsi2ss_r32_i64
6296
6297
6298;;
6299; cvtsi2sd instruction - 32-bit variant.
6300;
6301; @param A0 FPU context (FXSTATE or XSAVEAREA).
6302; @param A1 Where to return the MXCSR value.
6303; @param A2 Pointer to the result operand (output).
6304; @param A3 Pointer to the second operand (input).
6305;
6306BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
6307 PROLOGUE_4_ARGS
6308 IEMIMPL_SSE_PROLOGUE
6309 SSE_LD_FXSTATE_MXCSR A0
6310
6311 cvtsi2sd xmm0, dword [A3]
6312 movq [A2], xmm0
6313
6314 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6315 IEMIMPL_SSE_EPILOGUE
6316 EPILOGUE_4_ARGS
6317ENDPROC iemAImpl_cvtsi2sd_r64_i32
6318
6319;;
6320; cvtsi2sd instruction - 64-bit variant.
6321;
6322; @param A0 FPU context (FXSTATE or XSAVEAREA).
6323; @param A1 Where to return the MXCSR value.
6324; @param A2 Pointer to the result operand (output).
6325; @param A3 Pointer to the second operand (input).
6326;
6327BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
6328 PROLOGUE_4_ARGS
6329 IEMIMPL_SSE_PROLOGUE
6330 SSE_LD_FXSTATE_MXCSR A0
6331
6332 cvtsi2sd xmm0, qword [A3]
6333 movq [A2], xmm0
6334
6335 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6336 IEMIMPL_SSE_EPILOGUE
6337 EPILOGUE_4_ARGS
6338ENDPROC iemAImpl_cvtsi2sd_r64_i64
6339
6340
6341;;
6342; Initializes the SSE MXCSR register partially from the guest value, taking
6343; over the rounding mode, FZ and DAZ bits while keeping all exceptions masked.
6344;
6345; @uses 4 bytes of stack to save the original MXCSR value (left allocated for the matching restore macro) plus 4 bytes transiently; trashes T0.
6346; @param 1 Expression giving the address of the MXCSR register of the guest.
6347;
6348%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
6349 sub xSP, 4
6350
6351 stmxcsr [xSP]
6352 mov T0_32, [%1]
6353 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
6354 or T0_32, X86_MXCSR_XCPT_MASK
6355 sub xSP, 4
6356 mov [xSP], T0_32
6357 ldmxcsr [xSP]
6358 add xSP, 4
6359%endmacro
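;
; Worked example for the load above (guest value assumed for illustration):
; for a guest MXCSR of 0x2000 (RC=01, round toward -infinity, nothing else set)
; the value actually loaded into the host MXCSR is
;       (0x2000 & (X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ))
;           | X86_MXCSR_XCPT_MASK = 0x2000 | 0x1f80 = 0x3f80
; so the guest rounding mode, FZ and DAZ take effect while all exceptions stay
; masked on the host; the raised status flags are merged back afterwards by
; SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE.
;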
6360
6361
6362;;
6363; Restores the SSE MXCSR register with the original value.
6364;
6365; @uses 4 bytes of stack to save the current MXCSR value; trashes T0 and T1.
6366; @param 1 Expression giving the address at which to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6367;
6368; @note Restores the stack pointer, popping the 4 bytes left allocated by SSE_LD_FXSTATE_MXCSR_ONLY.
6369;
6370%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
6371 sub xSP, 4
6372 stmxcsr [xSP]
6373 mov T0_32, [xSP]
6374 add xSP, 4
6375 ; Merge the status bits into the original MXCSR value.
6376 mov T1_32, [%1]
6377 and T0_32, X86_MXCSR_XCPT_FLAGS
6378 or T0_32, T1_32
6379 mov [%1], T0_32
6380
6381 ldmxcsr [xSP]
6382 add xSP, 4
6383%endmacro
6384
6385
6386;
6387; UCOMISS (SSE)
6388;
6389; @param A0 Pointer to the MXCSR value (input/output).
6390; @param A1 Pointer to the EFLAGS value (input/output).
6391; @param A2 Pointer to the first source operand (aka readonly destination).
6392; @param A3 Pointer to the second source operand.
6393;
6394BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
6395 PROLOGUE_4_ARGS
6396 IEMIMPL_SSE_PROLOGUE
6397 SSE_LD_FXSTATE_MXCSR_ONLY A0
6398
6399 movdqu xmm0, [A2]
6400 movdqu xmm1, [A3]
6401 ucomiss xmm0, xmm1
6402 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6403
6404 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6405 IEMIMPL_SSE_EPILOGUE
6406 EPILOGUE_4_ARGS
6407ENDPROC iemAImpl_ucomiss_u128
6408
6409BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
6410 PROLOGUE_4_ARGS
6411 IEMIMPL_SSE_PROLOGUE
6412 SSE_LD_FXSTATE_MXCSR_ONLY A0
6413
6414 movdqu xmm0, [A2]
6415 movdqu xmm1, [A3]
6416 vucomiss xmm0, xmm1
6417 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6418
6419 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6420 IEMIMPL_SSE_EPILOGUE
6421 EPILOGUE_4_ARGS
6422ENDPROC iemAImpl_vucomiss_u128
6423
6424
6425;
6426; UCOMISD (SSE)
6427;
6428; @param A0 Pointer to the MXCSR value (input/output).
6429; @param A1 Pointer to the EFLAGS value (input/output).
6430; @param A2 Pointer to the first source operand (aka readonly destination).
6431; @param A3 Pointer to the second source operand.
6432;
6433BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
6434 PROLOGUE_4_ARGS
6435 IEMIMPL_SSE_PROLOGUE
6436 SSE_LD_FXSTATE_MXCSR_ONLY A0
6437
6438 movdqu xmm0, [A2]
6439 movdqu xmm1, [A3]
6440 ucomisd xmm0, xmm1
6441 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6442
6443 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6444 IEMIMPL_SSE_EPILOGUE
6445 EPILOGUE_4_ARGS
6446ENDPROC iemAImpl_ucomisd_u128
6447
6448BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
6449 PROLOGUE_4_ARGS
6450 IEMIMPL_SSE_PROLOGUE
6451 SSE_LD_FXSTATE_MXCSR_ONLY A0
6452
6453 movdqu xmm0, [A2]
6454 movdqu xmm1, [A3]
6455 vucomisd xmm0, xmm1
6456 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6457
6458 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6459 IEMIMPL_SSE_EPILOGUE
6460 EPILOGUE_4_ARGS
6461ENDPROC iemAImpl_vucomisd_u128
6462
6463;
6464; COMISS (SSE)
6465;
6466; @param A0 Pointer to the MXCSR value (input/output).
6467; @param A1 Pointer to the EFLAGS value (input/output).
6468; @param A2 Pointer to the first source operand (aka readonly destination).
6469; @param A3 Pointer to the second source operand.
6470;
6471BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
6472 PROLOGUE_4_ARGS
6473 IEMIMPL_SSE_PROLOGUE
6474 SSE_LD_FXSTATE_MXCSR_ONLY A0
6475
6476 movdqu xmm0, [A2]
6477 movdqu xmm1, [A3]
6478 comiss xmm0, xmm1
6479 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6480
6481 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6482 IEMIMPL_SSE_EPILOGUE
6483 EPILOGUE_4_ARGS
6484ENDPROC iemAImpl_comiss_u128
6485
6486BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
6487 PROLOGUE_4_ARGS
6488 IEMIMPL_SSE_PROLOGUE
6489 SSE_LD_FXSTATE_MXCSR_ONLY A0
6490
6491 movdqu xmm0, [A2]
6492 movdqu xmm1, [A3]
6493 vcomiss xmm0, xmm1
6494 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6495
6496 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6497 IEMIMPL_SSE_EPILOGUE
6498 EPILOGUE_4_ARGS
6499ENDPROC iemAImpl_vcomiss_u128
6500
6501
6502;
6503; COMISD (SSE)
6504;
6505; @param A0 Pointer to the MXCSR value (input/output).
6506; @param A1 Pointer to the EFLAGS value (input/output).
6507; @param A2 Pointer to the first source operand (aka readonly destination).
6508; @param A3 Pointer to the second source operand.
6509;
6510BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
6511 PROLOGUE_4_ARGS
6512 IEMIMPL_SSE_PROLOGUE
6513 SSE_LD_FXSTATE_MXCSR_ONLY A0
6514
6515 movdqu xmm0, [A2]
6516 movdqu xmm1, [A3]
6517 comisd xmm0, xmm1
6518 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6519
6520 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6521 IEMIMPL_SSE_EPILOGUE
6522 EPILOGUE_4_ARGS
6523ENDPROC iemAImpl_comisd_u128
6524
6525BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
6526 PROLOGUE_4_ARGS
6527 IEMIMPL_SSE_PROLOGUE
6528 SSE_LD_FXSTATE_MXCSR_ONLY A0
6529
6530 movdqu xmm0, [A2]
6531 movdqu xmm1, [A3]
6532 vcomisd xmm0, xmm1
6533 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6534
6535 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6536 IEMIMPL_SSE_EPILOGUE
6537 EPILOGUE_4_ARGS
6538ENDPROC iemAImpl_vcomisd_u128
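;
; Illustrative note: [v]ucomis[s|d] and [v]comis[s|d] report the result via
; ZF, PF and CF and clear OF, SF and AF, hence the IEM_SAVE_FLAGS of the full
; status bit set above:
;       unordered (NaN)  -> ZF=1 PF=1 CF=1
;       src1 <  src2     -> ZF=0 PF=0 CF=1
;       src1 >  src2     -> ZF=0 PF=0 CF=0
;       src1 == src2     -> ZF=1 PF=0 CF=0
;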
6539
6540
6541;;
6542; @todo Move this somewhere better as well?
6543;
6544struc IEMMEDIAF2XMMSRC
6545 .uSrc1 resd 4
6546 .uSrc2 resd 4
6547endstruc
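;
; Illustrative note: IEMMEDIAF2XMMSRC simply packs the two 128-bit sources back
; to back - uSrc1 at offset 0, uSrc2 at offset 16, 32 bytes in total - so the
; helpers below can fetch both inputs through the single pointer in A2.
;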
6548
6549
6550;
6551; CMPPS (SSE)
6552;
6553; @param A0 Pointer to the MXCSR value (input/output).
6554; @param A1 Pointer to the first media register size operand (output).
6555; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6556; @param A3 The 8-bit immediate (input).
6557;
6558BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
6559 PROLOGUE_4_ARGS
6560 IEMIMPL_SSE_PROLOGUE
6561 SSE_LD_FXSTATE_MXCSR_ONLY A0
6562
6563 movzx A3, A3_8 ; must clear top bits
6564 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6565 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6566 lea T1, [.imm0 xWrtRIP]
6567 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6568 lea T0, [A3 + A3*8] ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
6569 %else
6570 lea T0, [A3 + A3*4] ; sizeof(cmpps+ret) == 5: A3 * 5
6571 %endif
6572 lea T1, [T1 + T0]
6573 IBT_NOTRACK
6574 call T1
6575 movdqu [A1], xmm0
6576
6577 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6578 IEMIMPL_SSE_EPILOGUE
6579 EPILOGUE_4_ARGS
6580 %assign bImm 0
6581 %rep 256
6582.imm %+ bImm:
6583 IBT_ENDBRxx_WITHOUT_NOTRACK
6584 cmpps xmm0, xmm1, bImm
6585 ret
6586 %assign bImm bImm + 1
6587 %endrep
6588.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
6589ENDPROC iemAImpl_cmpps_u128
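;
; Illustrative note: the immediate selects the compare predicate; the
; architecturally defined legacy SSE values are 0..7 (0=EQ, 1=LT, 2=LE,
; 3=UNORD, 4=NEQ, 5=NLT, 6=NLE, 7=ORD), but the table covers all 256 encodings
; so whatever the host CPU does for the remaining values is reproduced as-is.
; Each stub is cmpps (4 bytes) + ret (1 byte) = 5 bytes, hence the A3 * 5
; scaling above (A3 * 9 with the endbr64 prepended).
;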
6590
6591;;
6592; SSE instructions with 8-bit immediates of the form
6593; xxx xmm1, xmm2, imm8.
6594; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
6595; register.
6596;
6597; @param 1 The instruction name.
6598;
6599; @param A0 Pointer to the MXCSR value (input/output).
6600; @param A1 Pointer to the first media register size operand (output).
6601; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6602; @param A3 The 8-bit immediate (input).
6603;
6604%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
6605BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6606 PROLOGUE_4_ARGS
6607 IEMIMPL_SSE_PROLOGUE
6608 SSE_LD_FXSTATE_MXCSR_ONLY A0
6609
6610 movzx A3, A3_8 ; must clear top bits
6611 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6612 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6613 lea T1, [.imm0 xWrtRIP]
6614 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6615 lea T0, [A3 + A3*4] ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
6616 %else
6617 lea T0, [A3 + A3*2] ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
6618 %endif
6619 lea T1, [T1 + T0*2]
6620 IBT_NOTRACK
6621 call T1
6622 movdqu [A1], xmm0
6623
6624 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6625 IEMIMPL_SSE_EPILOGUE
6626 EPILOGUE_4_ARGS
6627 %assign bImm 0
6628 %rep 256
6629.imm %+ bImm:
6630 IBT_ENDBRxx_WITHOUT_NOTRACK
6631 %1 xmm0, xmm1, bImm
6632 ret
6633 %assign bImm bImm + 1
6634 %endrep
6635.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6636ENDPROC iemAImpl_ %+ %1 %+ _u128
6637%endmacro
6638
6639IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
6640IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
6641IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6642
6643;;
6644; SSE instructions with 8-bit immediates of the form
6645; xxx xmm1, xmm2, imm8.
6646; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6647; register.
6648;
6649; @param 1 The instruction name.
6650;
6651; @param A0 Pointer to the MXCSR value (input/output).
6652; @param A1 Pointer to the first media register size operand (output).
6653; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6654; @param A3 The 8-bit immediate (input).
6655;
6656%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
6657BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6658 PROLOGUE_4_ARGS
6659 IEMIMPL_SSE_PROLOGUE
6660 SSE_LD_FXSTATE_MXCSR_ONLY A0
6661
6662 movzx A3, A3_8 ; must clear top bits
6663 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6664 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6665 lea T1, [.imm0 xWrtRIP]
6666 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6667 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
6668 lea T1, [T1 + T0*4]
6669 %else
6670 lea T1, [T1 + A3*8] ; sizeof(insn+ret+int3) == 8: A3 * 8
6671 %endif
6672 IBT_NOTRACK
6673 call T1
6674 movdqu [A1], xmm0
6675
6676 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6677 IEMIMPL_SSE_EPILOGUE
6678 EPILOGUE_4_ARGS
6679 %assign bImm 0
6680 %rep 256
6681.imm %+ bImm:
6682 IBT_ENDBRxx_WITHOUT_NOTRACK
6683 %1 xmm0, xmm1, bImm
6684 ret
6685 int3
6686 %assign bImm bImm + 1
6687 %endrep
6688.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
6689ENDPROC iemAImpl_ %+ %1 %+ _u128
6690%endmacro
6691
6692IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
6693IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
6694IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
6695IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6696IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6697IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
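;
; Illustrative note: these instructions all encode as 66 0f 3a xx /r ib, i.e.
; 6 bytes for a register-register form, so each stub is padded with an int3 to
; 8 bytes (insn + ret + int3), letting the dispatch use a plain A3 * 8 scaling
; (or A3 * 12 once the 4-byte endbr64 is prepended).
;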
6698
6699
6700;;
6701; SSE instructions of the form
6702; xxx mm, xmm.
6703; where we also need to load and save the MXCSR register.
6704;
6705; @param 1 The instruction name.
6706;
6707; @param A0 Pointer to the MXCSR value (input/output).
6708; @param A1 Pointer to the first MMX register sized operand (output).
6709; @param A2 Pointer to the media register sized operand (input).
6710;
6711%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
6712BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6713 PROLOGUE_3_ARGS
6714 IEMIMPL_SSE_PROLOGUE
6715 SSE_LD_FXSTATE_MXCSR_ONLY A0
6716
6717 movdqu xmm0, [A2]
6718 %1 mm0, xmm0
6719 movq [A1], mm0
6720
6721 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6722 IEMIMPL_SSE_EPILOGUE
6723 EPILOGUE_3_ARGS
6724ENDPROC iemAImpl_ %+ %1 %+ _u128
6725%endmacro
6726
6727IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
6728IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6729
6730;;
6731; SSE instructions of the form
6732; xxx xmm, xmm/m64.
6733; where we also need to load and save the MXCSR register.
6734;
6735; @param 1 The instruction name.
6736;
6737; @param A0 Pointer to the MXCSR value (input/output).
6738; @param A1 Pointer to the first media register sized operand (input/output).
6739; @param A2 The 64bit source value from an MMX media register (input).
6740;
6741%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
6742BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6743 PROLOGUE_3_ARGS
6744 IEMIMPL_SSE_PROLOGUE
6745 SSE_LD_FXSTATE_MXCSR_ONLY A0
6746
6747 movdqu xmm0, [A1]
6748 movq mm0, A2
6749 %1 xmm0, mm0
6750 movdqu [A1], xmm0
6751
6752 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6753 IEMIMPL_SSE_EPILOGUE
6754 EPILOGUE_3_ARGS
6755ENDPROC iemAImpl_ %+ %1 %+ _u128
6756%endmacro
6757
6758IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
6759IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6760
6761;;
6762; SSE instructions of the form
6763; xxx mm, xmm/m64.
6764; where we also need to load and save the MXCSR register.
6765;
6766; @param 1 The instruction name.
6767;
6768; @param A0 Pointer to the MXCSR value (input/output).
6769; @param A1 Pointer to the first MMX media register sized operand (output).
6770; @param A2 The 64bit source value (input).
6771;
6772%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
6773BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6774 PROLOGUE_3_ARGS
6775 IEMIMPL_SSE_PROLOGUE
6776 SSE_LD_FXSTATE_MXCSR_ONLY A0
6777
6778 movq xmm0, A2
6779 %1 mm0, xmm0
6780 movq [A1], mm0
6781
6782 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6783 IEMIMPL_SSE_EPILOGUE
6784 EPILOGUE_3_ARGS
6785ENDPROC iemAImpl_ %+ %1 %+ _u128
6786%endmacro
6787
6788IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
6789IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6790
6791;
6792; All forms of RDRAND and RDSEED
6793;
; @param 1 The instruction name (rdrand or rdseed).
; @param 2 The register to use with the instruction (ax, eax or rax).
; @param 3 The operand size in bits (16, 32 or 64).
;
6794; @param A0 Pointer to the destination operand.
6795; @param A1 Pointer to the EFLAGS value (input/output).
6796;
6797%macro IEMIMPL_RDRAND_RDSEED 3
6798BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
6799 PROLOGUE_2_ARGS
6800
6801 %1 %2
6802 mov [A0], %2
6803 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6804
6805 EPILOGUE_2_ARGS
6806ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
6807%endmacro
6808
6809IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
6810IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
6811IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
6812IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
6813IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
6814IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
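;
; Illustrative note: rdrand and rdseed signal success in CF (CF=1 when a value
; was delivered, CF=0 when the caller should retry), which is why only the
; destination and EFLAGS pointers are needed above.  A typical guest-side
; retry loop looks roughly like this (sketch, not assembled here):
;       .retry: rdrand  eax
;               jnc     .retry
;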
6815
6816
6817;;
6818; sha1rnds4 xmm1, xmm2, imm8.
6819;
6822; @param A0 Pointer to the first media register size operand (input/output).
6823; @param A1 Pointer to the second source media register size operand (input).
6824; @param A2 The 8-bit immediate
6825;
6826BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
6827 PROLOGUE_3_ARGS
6828 IEMIMPL_SSE_PROLOGUE
6829
6830 movzx A2, A2_8 ; must clear top bits
6831 movdqu xmm0, [A0]
6832 movdqu xmm1, [A1]
6833 lea T1, [.imm0 xWrtRIP]
6834 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6835 lea T0, [A2 + A2*4] ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
6836 %else
6837 lea T0, [A2 + A2*2] ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
6838 %endif
6839 lea T1, [T1 + T0*2]
6840 IBT_NOTRACK
6841 call T1
6842 movdqu [A0], xmm0
6843
6844 IEMIMPL_SSE_EPILOGUE
6845 EPILOGUE_3_ARGS
6846 %assign bImm 0
6847 %rep 256
6848.imm %+ bImm:
6849 IBT_ENDBRxx_WITHOUT_NOTRACK
6850 sha1rnds4 xmm0, xmm1, bImm
6851 ret
6852 %assign bImm bImm + 1
6853 %endrep
6854.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6855ENDPROC iemAImpl_sha1rnds4_u128
6856
6857
6858;;
6859; sha256rnds2 xmm1, xmm2, <XMM0>.
6860;
6863; @param A0 Pointer to the first media register size operand (input/output).
6864; @param A1 Pointer to the second source media register size operand (input).
6865; @param A2 Pointer to the implicit XMM0 constants (input).
6866;
6867BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
6868 PROLOGUE_3_ARGS
6869 IEMIMPL_SSE_PROLOGUE
6870
6871 movdqu xmm0, [A2]
6872 movdqu xmm1, [A0]
6873 movdqu xmm2, [A1]
6874 sha256rnds2 xmm1, xmm2
6875 movdqu [A0], xmm1
6876
6877 IEMIMPL_SSE_EPILOGUE
6878 EPILOGUE_3_ARGS
6879ENDPROC iemAImpl_sha256rnds2_u128
6880
6881
6882;
6883; 32-bit forms of ADCX and ADOX
6884;
; @param 1 The instruction name (adcx or adox).
; @param 2 The EFLAGS bit used and updated by the instruction (X86_EFL_CF or X86_EFL_OF).
;
6885; @param A0 Pointer to the destination operand (input/output).
6886; @param A1 32-bit source operand 1 (input).
6887; @param A2 Pointer to the EFLAGS value (input/output).
6888;
6889%macro IEMIMPL_ADX_32 2
6890BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
6891 PROLOGUE_4_ARGS
6892
6893 IEM_LOAD_FLAGS A2, %2, 0
6894 %1 A1_32, [A0]
6895 mov [A0], A1_32
6896 IEM_SAVE_FLAGS A2, %2, 0
6897
6898 EPILOGUE_4_ARGS
6899ENDPROC iemAImpl_ %+ %1 %+ _u32
6900%endmacro
6901
6902;
6903; 64-bit forms of ADCX and ADOX
6904;
; @param 1 The instruction name (adcx or adox).
; @param 2 The EFLAGS bit used and updated by the instruction (X86_EFL_CF or X86_EFL_OF).
;
6905; @param A0 Pointer to the destination operand (input/output).
6906; @param A1 64-bit source operand 1 (input).
6907; @param A2 Pointer to the EFLAGS value (input/output).
6908;
6909%macro IEMIMPL_ADX_64 2
6910BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
6911 PROLOGUE_4_ARGS
6912
6913 IEM_LOAD_FLAGS A2, %2, 0
6914 %1 A1, [A0]
6915 mov [A0], A1
6916 IEM_SAVE_FLAGS A2, %2, 0
6917
6918 EPILOGUE_4_ARGS
6919ENDPROC iemAImpl_ %+ %1 %+ _u64
6920%endmacro
6921
6922IEMIMPL_ADX_32 adcx, X86_EFL_CF
6923IEMIMPL_ADX_64 adcx, X86_EFL_CF
6924
6925IEMIMPL_ADX_32 adox, X86_EFL_OF
6926IEMIMPL_ADX_64 adox, X86_EFL_OF
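;
; Illustrative note: adcx reads and writes only CF while adox reads and writes
; only OF, so guest code can interleave two independent carry chains, roughly
; like this (sketch, not assembled here):
;       adcx    rax, [rsi]      ; chain 1 propagates through CF
;       adox    rbx, [rdi]      ; chain 2 propagates through OF
; which is why each helper above loads and saves just the single flag given
; as %2.
;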