VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 102727

Last change on this file since 102727 was 102656, checked in by vboxsync, 14 months ago

IEMAllAImpl.asm: Windows and SysV ABI does not guarantee that top bits of byte/word/dword arguments passed in 64-bit registers are cleared. We must clear them ourselves!

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 192.0 KB
Line 
1; $Id: IEMAllAImpl.asm 102656 2023-12-20 16:33:54Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; RET XX / RET wrapper for fastcall.
43;
; @param 1 The number of argument bytes to pop off the stack (used on x86
; Windows only, where fastcall is callee-cleanup).
44%macro RET_FASTCALL 1
45%ifdef RT_ARCH_X86
46 %ifdef RT_OS_WINDOWS
47 ret %1 ; x86 Windows fastcall: callee pops the stack arguments.
48 %else
49 ret ; other 32-bit targets: caller cleans up.
50 %endif
51%else
52 ret ; AMD64: arguments are passed in registers, nothing to pop.
53%endif
54%endmacro
55
56;;
57; NAME for fastcall functions.
58;
59;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
60; escaping (or whatever the dollar is good for here). Thus the ugly
61; prefix argument.
62;
; Default: plain NAME() mangling; the arg-byte count and prefix are ignored.
63%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
64%ifdef RT_ARCH_X86
65 %ifdef RT_OS_WINDOWS
66 %undef NAME_FASTCALL
; x86 Windows fastcall decoration: <prefix><name>@<arg byte count>.
67 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
68 %endif
69%endif
70
71;;
72; BEGINPROC for fastcall functions.
73;
74; @param 1 The function name (C).
75; @param 2 The argument size on x86.
76;
77%macro BEGINPROC_FASTCALL 2
78GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden ; hidden global label with fastcall-decorated name.
79 IBT_ENDBRxx ; emits endbr32/endbr64 when IBT branch protection is enabled; otherwise nothing.
80%endmacro
81
82
83;
84; We employ some macro assembly here to hide the calling convention differences.
85;
; On AMD64 all four arguments arrive in registers (SysV: rdi/rsi/rdx/rcx,
; MSC: rcx/rdx/r8/r9), so the prologues are empty and the epilogues are a
; plain ret.  On x86 the A0/A1 registers are the fastcall argument registers
; (ecx/edx) and the remaining arguments are fetched from the stack, so the
; prologues must save/load callee-saved registers and the epilogues must pop
; the stack arguments.
;
; NOTE(review): EPILOGUE_1_ARGS_EX takes 0 parameters in the AMD64 branch but
; 1 parameter in the x86 branch below -- confirm no caller passes an argument
; on AMD64 builds.
86%ifdef RT_ARCH_AMD64
87 %macro PROLOGUE_1_ARGS 0
88 %endmacro
89 %macro EPILOGUE_1_ARGS 0
90 ret
91 %endmacro
92 %macro EPILOGUE_1_ARGS_EX 0
93 ret
94 %endmacro
95
96 %macro PROLOGUE_2_ARGS 0
97 %endmacro
98 %macro EPILOGUE_2_ARGS 0
99 ret
100 %endmacro
101 %macro EPILOGUE_2_ARGS_EX 1
102 ret
103 %endmacro
104
105 %macro PROLOGUE_3_ARGS 0
106 %endmacro
107 %macro EPILOGUE_3_ARGS 0
108 ret
109 %endmacro
110 %macro EPILOGUE_3_ARGS_EX 1
111 ret
112 %endmacro
113
114 %macro PROLOGUE_4_ARGS 0
115 %endmacro
116 %macro EPILOGUE_4_ARGS 0
117 ret
118 %endmacro
119 %macro EPILOGUE_4_ARGS_EX 1
120 ret
121 %endmacro
122
 ; System V AMD64 argument registers.
123 %ifdef ASM_CALL64_GCC
124 %define A0 rdi
125 %define A0_32 edi
126 %define A0_16 di
127 %define A0_8 dil
128
129 %define A1 rsi
130 %define A1_32 esi
131 %define A1_16 si
132 %define A1_8 sil
133
134 %define A2 rdx
135 %define A2_32 edx
136 %define A2_16 dx
137 %define A2_8 dl
138
139 %define A3 rcx
140 %define A3_32 ecx
141 %define A3_16 cx
142 %define A3_8 cl
143 %endif
144
 ; Microsoft x64 argument registers.
145 %ifdef ASM_CALL64_MSC
146 %define A0 rcx
147 %define A0_32 ecx
148 %define A0_16 cx
149 %define A0_8 cl
150
151 %define A1 rdx
152 %define A1_32 edx
153 %define A1_16 dx
154 %define A1_8 dl
155
156 %define A2 r8
157 %define A2_32 r8d
158 %define A2_16 r8w
159 %define A2_8 r8b
160
161 %define A3 r9
162 %define A3_32 r9d
163 %define A3_16 r9w
164 %define A3_8 r9b
165 %endif
166
 ; Scratch (temporary) registers, volatile in both 64-bit conventions.
167 %define T0 rax
168 %define T0_32 eax
169 %define T0_16 ax
170 %define T0_8 al
171
172 %define T1 r11
173 %define T1_32 r11d
174 %define T1_16 r11w
175 %define T1_8 r11b
176
177 %define T2 r10 ; only AMD64
178 %define T2_32 r10d
179 %define T2_16 r10w
180 %define T2_8 r10b
181
182%else
183 ; x86
 ; T1 is edi (callee-saved), so every prologue pushes it.
184 %macro PROLOGUE_1_ARGS 0
185 push edi
186 %endmacro
187 %macro EPILOGUE_1_ARGS 0
188 pop edi
189 ret 0
190 %endmacro
191 %macro EPILOGUE_1_ARGS_EX 1
192 pop edi
193 ret %1
194 %endmacro
195
196 %macro PROLOGUE_2_ARGS 0
197 push edi
198 %endmacro
199 %macro EPILOGUE_2_ARGS 0
200 pop edi
201 ret 0
202 %endmacro
203 %macro EPILOGUE_2_ARGS_EX 1
204 pop edi
205 ret %1
206 %endmacro
207
 ; 3rd argument (A2=ebx) is on the stack: [esp + 4 (saved ebx) + 4 (ret addr)].
208 %macro PROLOGUE_3_ARGS 0
209 push ebx
210 mov ebx, [esp + 4 + 4]
211 push edi
212 %endmacro
213 %macro EPILOGUE_3_ARGS_EX 1
214 %if (%1) < 4
215 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
216 %endif
217 pop edi
218 pop ebx
219 ret %1
220 %endmacro
221 %macro EPILOGUE_3_ARGS 0
222 EPILOGUE_3_ARGS_EX 4
223 %endmacro
224
 ; 3rd and 4th arguments (A2=ebx, A3=esi) are on the stack past the three
 ; saved registers (12 bytes) and the return address (4 bytes).
225 %macro PROLOGUE_4_ARGS 0
226 push ebx
227 push edi
228 push esi
229 mov ebx, [esp + 12 + 4 + 0]
230 mov esi, [esp + 12 + 4 + 4]
231 %endmacro
232 %macro EPILOGUE_4_ARGS_EX 1
233 %if (%1) < 8
234 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
235 %endif
236 pop esi
237 pop edi
238 pop ebx
239 ret %1
240 %endmacro
241 %macro EPILOGUE_4_ARGS 0
242 EPILOGUE_4_ARGS_EX 8
243 %endmacro
244
 ; x86 fastcall argument registers (ecx/edx) plus stack-loaded A2/A3.
245 %define A0 ecx
246 %define A0_32 ecx
247 %define A0_16 cx
248 %define A0_8 cl
249
250 %define A1 edx
251 %define A1_32 edx
252 %define A1_16 dx
253 %define A1_8 dl
254
255 %define A2 ebx
256 %define A2_32 ebx
257 %define A2_16 bx
258 %define A2_8 bl
259
260 %define A3 esi
261 %define A3_32 esi
262 %define A3_16 si
 ; Note: no A3_8 -- esi has no 8-bit sub-register in 32-bit mode.
263
264 %define T0 eax
265 %define T0_32 eax
266 %define T0_16 ax
267 %define T0_8 al
268
269 %define T1 edi
270 %define T1_32 edi
271 %define T1_16 di
 ; Note: no T1_8 (edi) and no T2 at all on x86.
272%endif
273
274
275;;
276; Load the relevant flags from [%1] if there are undefined flags (%3).
277;
278; @remarks Clobbers T0, stack. Changes EFLAGS.
280; @param 1 The parameter (A0..A3) pointing to the eflags.
281; @param 2 The set of modified flags.
282; @param 3 The set of undefined flags.
283;
; NOTE(review): the %if guard on %3 is commented out below, so the guest
; flags are currently loaded unconditionally, not only when %3 is non-zero.
284%macro IEM_MAYBE_LOAD_FLAGS 3
285 ;%if (%3) != 0
286 pushf ; store current flags
287 mov T0_32, [%1] ; load the guest flags
288 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
289 and T0_32, (%2 | %3) ; select the modified and undefined flags.
290 or [xSP], T0 ; merge guest flags with host flags.
291 popf ; load the mixed flags.
292 ;%endif
293%endmacro
294
295;;
296; Load the relevant flags from [%1].
297;
; Same sequence as IEM_MAYBE_LOAD_FLAGS, but by contract always loads.
298; @remarks Clobbers T0, stack. Changes EFLAGS.
300; @param 1 The parameter (A0..A3) pointing to the eflags.
301; @param 2 The set of flags to load.
302; @param 3 The set of undefined flags.
303;
304%macro IEM_LOAD_FLAGS 3
305 pushf ; store current flags
306 mov T0_32, [%1] ; load the guest flags
307 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
308 and T0_32, (%2 | %3) ; select the modified and undefined flags.
309 or [xSP], T0 ; merge guest flags with host flags.
310 popf ; load the mixed flags.
311%endmacro
312
313;;
314; Update the flag.
315;
; Merges the host CPU's current EFLAGS (the %2|%3 subset) back into the
; guest eflags at [%1], leaving all other guest flag bits untouched.
316; @remarks Clobbers T0, T1, stack.
317; @param 1 The register pointing to the EFLAGS.
318; @param 2 The mask of modified flags to save.
319; @param 3 The mask of undefined flags to (maybe) save.
320;
321%macro IEM_SAVE_FLAGS 3
322 %if (%2 | %3) != 0
323 pushf
324 pop T1 ; T1 = host EFLAGS after the emulated instruction.
325 mov T0_32, [%1] ; flags
326 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
327 and T1_32, (%2 | %3) ; select the modified and undefined flags.
328 or T0_32, T1_32 ; combine the flags.
329 mov [%1], T0_32 ; save the flags.
330 %endif
331%endmacro
332
333;;
334; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
335;
336; @remarks Clobbers T0, T1, stack.
337; @param 1 The register pointing to the EFLAGS.
338; @param 2 The mask of modified flags to save.
339; @param 3 Mask of additional flags to always clear
340; @param 4 Mask of additional flags to always set.
341;
342%macro IEM_SAVE_AND_ADJUST_FLAGS 4
343 %if (%2 | %3 | %4) != 0
344 pushf
345 pop T1 ; T1 = host EFLAGS after the emulated instruction.
346 mov T0_32, [%1] ; load flags.
347 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
348 and T1_32, (%2) ; select the modified flags.
349 or T0_32, T1_32 ; combine the flags.
350 %if (%4) != 0
351 or T0_32, %4 ; add the always set flags.
352 %endif
353 mov [%1], T0_32 ; save the result.
354 %endif
355%endmacro
356
357;;
358; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
359; signed input (%4[%5]) and parity index (%6).
360;
361; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
362; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
363; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
364;
365; @remarks Clobbers T0, T1, T2 (AMD64 only), stack, %6, EFLAGS.
366; @param 1 The register pointing to the EFLAGS.
367; @param 2 The mask of modified flags to save.
368; @param 3 Mask of additional flags to always clear
369; @param 4 The result register to set SF by.
370; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
371; @param 6 The (full) register containing the parity table index. Will be modified!
372
373%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
374 %ifdef RT_ARCH_AMD64
375 pushf
376 pop T2 ; T2 = host EFLAGS (T0/xAX still holds the result).
377 %else
378 push T0 ; x86 has no T2; save T0/xAX and reuse it for the flags.
379 pushf
380 pop T0
381 %endif
382 mov T1_32, [%1] ; load flags.
383 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
384 %ifdef RT_ARCH_AMD64
385 and T2_32, (%2) ; select the modified flags.
386 or T1_32, T2_32 ; combine the flags.
387 %else
388 and T0_32, (%2) ; select the modified flags.
389 or T1_32, T0_32 ; combine the flags.
390 pop T0 ; restore the result in T0/xAX.
391 %endif
392
393 ; First calculate SF as it's likely to be referring to the same register as %6 does.
394 bt %4, %5 - 1 ; CF = sign bit of the result.
395 jnc %%sf_clear
396 or T1_32, X86_EFL_SF
397 %%sf_clear:
398
399 ; Parity last.
400 and %6, 0xff ; index the 256-entry parity lookup table by the low result byte.
401 %ifdef RT_ARCH_AMD64
402 lea T2, [NAME(g_afParity) xWrtRIP]
403 or T1_8, [T2 + %6]
404 %else
405 or T1_8, [NAME(g_afParity) + %6]
406 %endif
407
408 mov [%1], T1_32 ; save the result.
409%endmacro
410
411;;
412; Calculates the new EFLAGS using fixed clear and set bit masks.
413;
; Pure adjustment: does not read the host CPU flags at all.
414; @remarks Clobbers T0.
415; @param 1 The register pointing to the EFLAGS.
416; @param 2 Mask of additional flags to always clear
417; @param 3 Mask of additional flags to always set.
418;
419%macro IEM_ADJUST_FLAGS 3
420 %if (%2 | %3) != 0
421 mov T0_32, [%1] ; Load flags.
422 %if (%2) != 0
423 and T0_32, ~(%2) ; Remove the always cleared flags.
424 %endif
425 %if (%3) != 0
426 or T0_32, %3 ; Add the always set flags.
427 %endif
428 mov [%1], T0_32 ; Save the result.
429 %endif
430%endmacro
431
432;;
433; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Like IEM_ADJUST_FLAGS, but additionally computes PF from the low byte of
; the result register via the g_afParity lookup table.
434;
435; @remarks Clobbers T0, T2 (AMD64 only), %4, EFLAGS.
436; @param 1 The register pointing to the EFLAGS.
437; @param 2 Mask of additional flags to always clear
438; @param 3 Mask of additional flags to always set.
439; @param 4 The (full) register containing the parity table index. Will be modified!
440;
441%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
442 mov T0_32, [%1] ; Load flags.
443 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
444 %if (%3) != 0
445 or T0_32, %3 ; Add the always set flags.
446 %endif
447 and %4, 0xff ; parity table is indexed by the low result byte.
448 %ifdef RT_ARCH_AMD64
449 lea T2, [NAME(g_afParity) xWrtRIP]
450 or T0_8, [T2 + %4]
451 %else
452 or T0_8, [NAME(g_afParity) + %4]
453 %endif
454 mov [%1], T0_32 ; Save the result.
455%endmacro
456
457
458;;
459; Checks that the size expression %1 matches %2 adjusted according to
460; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
; The two dw pseudo-instructions overflow the 16-bit range (triggering an
; assembler warning) whenever %1 is larger/smaller than the expected size.
461; @param 1 The jump array size assembly expression.
462; @param 2 The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
463;
464%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
465 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
466 dw (0xffff - %2 - 256*4) + %1 ; will cause warning if entries are too big.
467 dw (0xffff + %2 + 256*4) - %1 ; will cause warning if entries are too small.
468 %else
469 dw (0xffff - %2) + %1 ; will cause warning if entries are too big.
470 dw (0xffff + %2) - %1 ; will cause warning if entries are too small.
471 %endif
472%endmacro
473
474
475;*********************************************************************************************************************************
476;* External Symbols *
477;*********************************************************************************************************************************
478extern NAME(g_afParity)
479
480
481;;
482; Macro for implementing a binary operator.
483;
484; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
485; variants, except on 32-bit system where the 64-bit accesses requires hand
486; coding.
487;
488; All the functions takes a pointer to the destination memory operand in A0,
489; the source register operand in A1 and a pointer to eflags in A2.
490;
491; @param 1 The instruction mnemonic.
492; @param 2 Non-zero if there should be a locked version.
493; @param 3 The modified flags.
494; @param 4 The undefined flags.
495;
496%macro IEMIMPL_BIN_OP 4
497BEGINCODE
498BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
499 PROLOGUE_3_ARGS
500 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
501 %1 byte [A0], A1_8 ; perform the operation directly on guest memory.
502 IEM_SAVE_FLAGS A2, %3, %4
503 EPILOGUE_3_ARGS
504ENDPROC iemAImpl_ %+ %1 %+ _u8
505
506BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
507 PROLOGUE_3_ARGS
508 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
509 %1 word [A0], A1_16
510 IEM_SAVE_FLAGS A2, %3, %4
511 EPILOGUE_3_ARGS
512ENDPROC iemAImpl_ %+ %1 %+ _u16
513
514BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
515 PROLOGUE_3_ARGS
516 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
517 %1 dword [A0], A1_32
518 IEM_SAVE_FLAGS A2, %3, %4
519 EPILOGUE_3_ARGS
520ENDPROC iemAImpl_ %+ %1 %+ _u32
521
522 %ifdef RT_ARCH_AMD64
523BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
524 PROLOGUE_3_ARGS
525 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
526 %1 qword [A0], A1
527 IEM_SAVE_FLAGS A2, %3, %4
528 EPILOGUE_3_ARGS_EX 8 ; the 64-bit value takes 8 stack bytes on x86 callers.
529ENDPROC iemAImpl_ %+ %1 %+ _u64
530 %endif ; RT_ARCH_AMD64
531
532 %if %2 != 0 ; locked versions requested?
533
534BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
535 PROLOGUE_3_ARGS
536 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
537 lock %1 byte [A0], A1_8 ; same as above with a bus lock for atomicity.
538 IEM_SAVE_FLAGS A2, %3, %4
539 EPILOGUE_3_ARGS
540ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
541
542BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
543 PROLOGUE_3_ARGS
544 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
545 lock %1 word [A0], A1_16
546 IEM_SAVE_FLAGS A2, %3, %4
547 EPILOGUE_3_ARGS
548ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
549
550BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
551 PROLOGUE_3_ARGS
552 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
553 lock %1 dword [A0], A1_32
554 IEM_SAVE_FLAGS A2, %3, %4
555 EPILOGUE_3_ARGS
556ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
557
558 %ifdef RT_ARCH_AMD64
559BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
560 PROLOGUE_3_ARGS
561 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
562 lock %1 qword [A0], A1
563 IEM_SAVE_FLAGS A2, %3, %4
564 EPILOGUE_3_ARGS_EX 8
565ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
566 %endif ; RT_ARCH_AMD64
567 %endif ; locked
568%endmacro
569
570; instr,lock, modified-flags, undefined flags
; Arithmetic ops modify all six status flags; the logical ops (or/xor/and/test)
; leave AF undefined.  cmp and test do not write memory, hence no locked forms.
571IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
572IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
573IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
574IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
575IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
576IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
577IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
578IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
579IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
580
581
582;;
583; Macro for implementing a binary operator, VEX variant with separate input/output.
584;
585; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
586; where the 64-bit accesses requires hand coding.
587;
588; All the functions takes a pointer to the destination memory operand in A0,
589; the first source register operand in A1, the second source register operand
590; in A2 and a pointer to eflags in A3.
591;
592; @param 1 The instruction mnemonic.
593; @param 2 The modified flags.
594; @param 3 The undefined flags.
595;
596%macro IEMIMPL_VEX_BIN_OP 3
597BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
598 PROLOGUE_4_ARGS
599 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
600 %1 T0_32, A1_32, A2_32 ; three-operand VEX form: result in T0.
601 mov [A0], T0_32
602 IEM_SAVE_FLAGS A3, %2, %3
603 EPILOGUE_4_ARGS
604ENDPROC iemAImpl_ %+ %1 %+ _u32
605
606 %ifdef RT_ARCH_AMD64
607BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
608 PROLOGUE_4_ARGS
609 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
610 %1 T0, A1, A2
611 mov [A0], T0
612 IEM_SAVE_FLAGS A3, %2, %3
613 EPILOGUE_4_ARGS
614ENDPROC iemAImpl_ %+ %1 %+ _u64
615 %endif ; RT_ARCH_AMD64
616%endmacro
617
618; instr, modified-flags, undefined-flags
619IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
620IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
621IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
622
623;;
624; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
625;
626; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
627; where the 64-bit accesses requires hand coding.
628;
629; All the functions takes a pointer to the destination memory operand in A0,
630; the source register operand in A1 and a pointer to eflags in A2.
631;
632; @param 1 The instruction mnemonic.
633; @param 2 The modified flags.
634; @param 3 The undefined flags.
635;
; NOTE(review): these are 3-argument functions but use the 4-argument
; prologue/epilogue pair -- harmless (only saves/restores extra registers),
; but confirm this is intentional.
636%macro IEMIMPL_VEX_BIN_OP_2 3
637BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
638 PROLOGUE_4_ARGS
639 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
640 mov T0_32, [A0] ; these BMI1 ops read and write the same operand.
641 %1 T0_32, A1_32
642 mov [A0], T0_32
643 IEM_SAVE_FLAGS A2, %2, %3
644 EPILOGUE_4_ARGS
645ENDPROC iemAImpl_ %+ %1 %+ _u32
646
647 %ifdef RT_ARCH_AMD64
648BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
649 PROLOGUE_4_ARGS
650 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
651 mov T0, [A0]
652 %1 T0, A1
653 mov [A0], T0
654 IEM_SAVE_FLAGS A2, %2, %3
655 EPILOGUE_4_ARGS
656ENDPROC iemAImpl_ %+ %1 %+ _u64
657 %endif ; RT_ARCH_AMD64
658%endmacro
659
660; instr, modified-flags, undefined-flags
661IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
662IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
663IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
664
665
666;;
667; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
668;
669; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
670; where the 64-bit accesses requires hand coding.
671;
672; All the functions takes a pointer to the destination memory operand in A0,
673; the first source register operand in A1 and the second source register
674; operand (the shift count for the fallbacks) in A2.  No eflags pointer is
; taken since these instructions leave EFLAGS untouched.
675;
676; @param 1 The instruction mnemonic.
677; @param 2 Fallback instruction if applicable.
678; @param 3 Whether to emit fallback or not.
679;
680%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
681BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
682 PROLOGUE_3_ARGS
683 %1 T0_32, A1_32, A2_32
684 mov [A0], T0_32
685 EPILOGUE_3_ARGS
686ENDPROC iemAImpl_ %+ %1 %+ _u32
687
688 %if %3
689BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
690 PROLOGUE_3_ARGS
691 %ifdef ASM_CALL64_GCC
692 mov cl, A2_8 ; shift count into cl (rcx/A3 is free in the GCC convention).
693 %2 A1_32, cl
694 mov [A0], A1_32
695 %else
696 xchg A2, A0 ; MSC: A0 is rcx, so swap to get cl = count, A2 = result ptr.
697 %2 A1_32, cl
698 mov [A2], A1_32
699 %endif
700 EPILOGUE_3_ARGS
701ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
702 %endif
703
704 %ifdef RT_ARCH_AMD64
705BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
706 PROLOGUE_3_ARGS
707 %1 T0, A1, A2
708 mov [A0], T0
709 EPILOGUE_3_ARGS
710ENDPROC iemAImpl_ %+ %1 %+ _u64
711
712 %if %3
713BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
714 PROLOGUE_3_ARGS
715 %ifdef ASM_CALL64_GCC
716 mov cl, A2_8
717 %2 A1, cl
718 mov [A0], A1 ; store the full 64-bit result (bugfix: was A1_32, truncating the high half).
719 %else
720 xchg A2, A0 ; MSC: cl = count, A2 = result ptr after the swap.
721 %2 A1, cl
722 mov [A2], A1 ; store the full 64-bit result (bugfix: was A1_32, truncating the high half).
723 %endif
 ; (bugfix: removed a stray 'mov [A0], A1' here; on the MSC path A0 no
 ; longer points at the destination after the xchg above.)
724 EPILOGUE_3_ARGS
725ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
726 %endif
727 %endif ; RT_ARCH_AMD64
728%endmacro
730
731; instr, fallback instr, emit fallback
; pdep/pext have no simple single-instruction fallback, so %3 = 0 and the
; 'nop' fallback mnemonic is never emitted (C fallbacks are used instead).
732IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
733IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
734IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
735IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
736IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
737
738
739;
740; RORX uses an immediate byte for the shift count, so we only do
741; fallback implementation of that one.
742;
743BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
744 PROLOGUE_3_ARGS
745 %ifdef ASM_CALL64_GCC
746 mov cl, A2_8 ; rotate count into cl (rcx is free in the GCC convention).
747 ror A1_32, cl
748 mov [A0], A1_32
749 %else
750 xchg A2, A0 ; MSC: A0 is rcx; swap so cl = count and A2 = result ptr.
751 ror A1_32, cl
752 mov [A2], A1_32
753 %endif
754 EPILOGUE_3_ARGS
755ENDPROC iemAImpl_rorx_u32
756
757 %ifdef RT_ARCH_AMD64
758BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
759 PROLOGUE_3_ARGS
760 %ifdef ASM_CALL64_GCC
761 mov cl, A2_8
762 ror A1, cl
763 mov [A0], A1
764 %else
765 xchg A2, A0 ; MSC: cl = count, A2 = result ptr after the swap.
766 ror A1, cl
767 mov [A2], A1
768 %endif
769 EPILOGUE_3_ARGS
770ENDPROC iemAImpl_rorx_u64
771 %endif ; RT_ARCH_AMD64
772
773
774;
775; MULX
776;
; mulx implicitly uses (E/R)DX as the second multiplicand, so the argument
; registers must be shuffled so that the source ends up in xDX.
777BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
778 PROLOGUE_4_ARGS
779%ifdef ASM_CALL64_GCC
780 ; A2_32 is EDX - perfect
781 mulx T0_32, T1_32, A3_32
782 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
783 mov [A0], T0_32
784%else
785 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
786 xchg A1, A2
787 mulx T0_32, T1_32, A3_32
788 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
789 mov [A0], T0_32
790%endif
791 EPILOGUE_4_ARGS
792ENDPROC iemAImpl_mulx_u32
793
794
; Fallback for hosts without BMI2: plain unsigned MUL, which produces the
; 64-bit product in EDX:EAX (and clobbers EFLAGS, unlike mulx).
795BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
796 PROLOGUE_4_ARGS
797%ifdef ASM_CALL64_GCC
798 ; A2_32 is EDX, T0_32 is EAX
799 mov eax, A3_32
800 mul A2_32
801 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
802 mov [A0], edx
803%else
804 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
805 xchg A1, A2
806 mov eax, A3_32
807 mul A2_32
808 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
809 mov [A0], edx
810%endif
811 EPILOGUE_4_ARGS
812ENDPROC iemAImpl_mulx_u32_fallback
813
814%ifdef RT_ARCH_AMD64
; 64-bit MULX and its MUL-based fallback; same register shuffling as the
; 32-bit versions above (mulx/mul implicitly use RDX / RDX:RAX).
815BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
816 PROLOGUE_4_ARGS
817%ifdef ASM_CALL64_GCC
818 ; A2 is RDX - perfect
819 mulx T0, T1, A3
820 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
821 mov [A0], T0
822%else
823 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
824 xchg A1, A2
825 mulx T0, T1, A3
826 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
827 mov [A0], T0
828%endif
829 EPILOGUE_4_ARGS
830ENDPROC iemAImpl_mulx_u64
831
832
833BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
834 PROLOGUE_4_ARGS
835%ifdef ASM_CALL64_GCC
836 ; A2 is RDX, T0 is RAX
837 mov rax, A3
838 mul A2
839 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
840 mov [A0], rdx
841%else
842 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
843 xchg A1, A2
844 mov rax, A3
845 mul A2
846 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
847 mov [A0], rdx
848%endif
849 EPILOGUE_4_ARGS
850ENDPROC iemAImpl_mulx_u64_fallback
851
852%endif
853
854
855;;
856; Macro for implementing a bit operator.
857;
858; This will generate code for the 16, 32 and 64 bit accesses with locked
859; variants, except on 32-bit system where the 64-bit accesses requires hand
860; coding.
;
; No 8-bit variants: the bt/btc/bts/btr instructions have no byte forms.
861;
862; All the functions takes a pointer to the destination memory operand in A0,
863; the source register operand in A1 and a pointer to eflags in A2.
864;
865; @param 1 The instruction mnemonic.
866; @param 2 Non-zero if there should be a locked version.
867; @param 3 The modified flags.
868; @param 4 The undefined flags.
869;
870%macro IEMIMPL_BIT_OP 4
871BEGINCODE
872BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
873 PROLOGUE_3_ARGS
874 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
875 %1 word [A0], A1_16
876 IEM_SAVE_FLAGS A2, %3, %4
877 EPILOGUE_3_ARGS
878ENDPROC iemAImpl_ %+ %1 %+ _u16
879
880BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
881 PROLOGUE_3_ARGS
882 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
883 %1 dword [A0], A1_32
884 IEM_SAVE_FLAGS A2, %3, %4
885 EPILOGUE_3_ARGS
886ENDPROC iemAImpl_ %+ %1 %+ _u32
887
888 %ifdef RT_ARCH_AMD64
889BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
890 PROLOGUE_3_ARGS
891 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
892 %1 qword [A0], A1
893 IEM_SAVE_FLAGS A2, %3, %4
894 EPILOGUE_3_ARGS_EX 8
895ENDPROC iemAImpl_ %+ %1 %+ _u64
896 %endif ; RT_ARCH_AMD64
897
898 %if %2 != 0 ; locked versions requested?
899
900BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
901 PROLOGUE_3_ARGS
902 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
903 lock %1 word [A0], A1_16 ; atomic variant for locked guest accesses.
904 IEM_SAVE_FLAGS A2, %3, %4
905 EPILOGUE_3_ARGS
906ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
907
908BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
909 PROLOGUE_3_ARGS
910 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
911 lock %1 dword [A0], A1_32
912 IEM_SAVE_FLAGS A2, %3, %4
913 EPILOGUE_3_ARGS
914ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
915
916 %ifdef RT_ARCH_AMD64
917BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
918 PROLOGUE_3_ARGS
919 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
920 lock %1 qword [A0], A1
921 IEM_SAVE_FLAGS A2, %3, %4
922 EPILOGUE_3_ARGS_EX 8
923ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
924 %endif ; RT_ARCH_AMD64
925 %endif ; locked
926%endmacro
; bt only reads the destination, so it gets no locked variant.
927IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
928IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
929IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
930IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
931
932;;
933; Macro for implementing a bit search operator.
934;
935; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
936; system where the 64-bit accesses requires hand coding.
937;
938; All the functions takes a pointer to the destination memory operand in A0,
939; the source register operand in A1 and a pointer to eflags in A2.
940;
941; In the ZF case the destination register is 'undefined', however it seems that
942; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
943; AMD and Intel and accoridng to https://www.sandpile.org/x86/flags.htm between
944; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
945; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
946;
947; @param 1 The instruction mnemonic.
948; @param 2 The modified flags.
949; @param 3 The undefined flags.
950; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
951;
952%macro IEMIMPL_BIT_OP2 4
953BEGINCODE
954BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
955 PROLOGUE_3_ARGS
956 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
957 %1 T0_16, A1_16
958%if %4 != 0
959 jz .unchanged_dst
960%endif
961 mov [A0], T0_16
962.unchanged_dst:
963 IEM_SAVE_FLAGS A2, %2, %3
964 EPILOGUE_3_ARGS
965ENDPROC iemAImpl_ %+ %1 %+ _u16
966
967BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
968 PROLOGUE_3_ARGS
969 %1 T1_16, A1_16
970%if %4 != 0
971 jz .unchanged_dst
972%endif
973 mov [A0], T1_16
974 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
975 EPILOGUE_3_ARGS
976.unchanged_dst:
977 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
978 EPILOGUE_3_ARGS
979ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
980
981BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
982 PROLOGUE_3_ARGS
983 %1 T0_16, A1_16
984%if %4 != 0
985 jz .unchanged_dst
986%endif
987 mov [A0], T0_16
988.unchanged_dst:
989 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
990 EPILOGUE_3_ARGS
991ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
992
993
994BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
995 PROLOGUE_3_ARGS
996 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
997 %1 T0_32, A1_32
998%if %4 != 0
999 jz .unchanged_dst
1000%endif
1001 mov [A0], T0_32
1002.unchanged_dst:
1003 IEM_SAVE_FLAGS A2, %2, %3
1004 EPILOGUE_3_ARGS
1005ENDPROC iemAImpl_ %+ %1 %+ _u32
1006
1007BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1008 PROLOGUE_3_ARGS
1009 %1 T1_32, A1_32
1010%if %4 != 0
1011 jz .unchanged_dst
1012%endif
1013 mov [A0], T1_32
1014 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1015 EPILOGUE_3_ARGS
1016.unchanged_dst:
1017 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1018 EPILOGUE_3_ARGS
1019ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1020
1021BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1022 PROLOGUE_3_ARGS
1023 %1 T0_32, A1_32
1024%if %4 != 0
1025 jz .unchanged_dst
1026%endif
1027 mov [A0], T0_32
1028.unchanged_dst:
1029 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1030 EPILOGUE_3_ARGS
1031ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1032
1033
1034 %ifdef RT_ARCH_AMD64
1035
1036BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1037 PROLOGUE_3_ARGS
1038 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1039 %1 T0, A1
1040%if %4 != 0
1041 jz .unchanged_dst
1042%endif
1043 mov [A0], T0
1044.unchanged_dst:
1045 IEM_SAVE_FLAGS A2, %2, %3
1046 EPILOGUE_3_ARGS_EX 8
1047ENDPROC iemAImpl_ %+ %1 %+ _u64
1048
1049BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1050 PROLOGUE_3_ARGS
1051 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1052 %1 T1, A1
1053%if %4 != 0
1054 jz .unchanged_dst
1055%endif
1056 mov [A0], T1
1057 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1058 EPILOGUE_3_ARGS
1059.unchanged_dst:
1060 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1061 EPILOGUE_3_ARGS
1062ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1063
1064BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1065 PROLOGUE_3_ARGS
1066 %1 T0, A1
1067%if %4 != 0
1068 jz .unchanged_dst
1069%endif
1070 mov [A0], T0
1071.unchanged_dst:
1072 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1073 EPILOGUE_3_ARGS_EX 8
1074ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1075
1076 %endif ; RT_ARCH_AMD64
1077%endmacro
1078
; Instantiate IEMIMPL_BIT_OP2 for the bit-scan and trailing/leading-zero-count
; instructions.  Arguments: mnemonic, modified EFLAGS, undefined EFLAGS, and a
; flag (%4) selecting whether the destination store is skipped when the
; instruction sets ZF (zero source operand): 1 for bsf/bsr, 0 for tzcnt/lzcnt
; which always write their destination.
IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1083
1084
;;
; Macro for implementing POPCNT.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16            ; T0 = %1(source register)
        mov     [A0], T0_16             ; store the result
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
; POPCNT modifies all six status flags; none are declared undefined.
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1134
1135
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
; All variants take a pointer to the destination (r/m operand) in A0, the
; register operand in A1 and a pointer to eflags in A2.
;
BEGINCODE
; @param 1      EFLAGS that are modified.
; @param 2      Undefined EFLAGS.
; @param 3      Function suffix.
; @param 4      EFLAGS variation: 0 for native,
;               1 for intel (SF and PF are calculated from the result and
;               AF/ZF adjusted - see IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF),
;               2 for AMD (plain flag save, but the instantiation below
;               declares no flags undefined).
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_16, word [A0]        ; reg = reg * *pu16Dst
        mov     [A0], A1_16             ; store the (truncated) product
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1191
1192
1193;
1194; XCHG for memory operands. This implies locking. No flag changes.
1195;
1196; Each function takes two arguments, first the pointer to the memory,
1197; then the pointer to the register. They all return void.
1198;
1199BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A1]              ; T1 = *pu8Reg
        xchg    [A0], T1_8              ; swap with *pu8Mem (xchg w/ memory is implicitly locked)
        mov     [A1], T1_8              ; *pu8Reg = old *pu8Mem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A1]             ; T1 = *pu16Reg
        xchg    [A0], T1_16             ; atomic swap with *pu16Mem
        mov     [A1], T1_16             ; *pu16Reg = old *pu16Mem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A1]             ; T1 = *pu32Reg
        xchg    [A0], T1_32             ; atomic swap with *pu32Mem
        mov     [A1], T1_32             ; *pu32Reg = old *pu32Mem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A1]                ; T1 = *pu64Reg
        xchg    [A0], T1                ; atomic swap with *pu64Mem
        mov     [A1], T1                ; *pu64Reg = old *pu64Mem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1233
1234; Unlocked variants for fDisregardLock mode.
1235
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A0]              ; T1 = old *pu8Mem
        mov     T0_8, [A1]              ; T0 = old *pu8Reg
        mov     [A1], T1_8              ; *pu8Reg = old *pu8Mem
        mov     [A0], T0_8              ; *pu8Mem = old *pu8Reg (not atomic)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A0]             ; T1 = old *pu16Mem
        mov     T0_16, [A1]             ; T0 = old *pu16Reg
        mov     [A1], T1_16
        mov     [A0], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A0]             ; T1 = old *pu32Mem
        mov     T0_32, [A1]             ; T0 = old *pu32Reg
        mov     [A1], T1_32
        mov     [A0], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A0]                ; T1 = old *pu64Mem
        mov     T0, [A1]                ; T0 = old *pu64Reg
        mov     [A1], T1
        mov     [A0], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1273
1274
1275;
1276; XADD for memory operands.
1277;
1278; Each function takes three arguments, first the pointer to the
1279; memory/register, then the pointer to the register, and finally a pointer to
1280; eflags. They all return void.
1281;
1282BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = *pu8Reg
        xadd    [A0], T0_8              ; *pu8Mem += T0; T0 = old *pu8Mem
        mov     [A1], T0_8              ; *pu8Reg = old *pu8Mem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; T0 = *pu16Reg
        xadd    [A0], T0_16             ; *pu16Mem += T0; T0 = old *pu16Mem
        mov     [A1], T0_16             ; *pu16Reg = old *pu16Mem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; T0 = *pu32Reg
        xadd    [A0], T0_32             ; *pu32Mem += T0; T0 = old *pu32Mem
        mov     [A1], T0_32             ; *pu32Reg = old *pu32Mem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; T0 = *pu64Reg
        xadd    [A0], T0                ; *pu64Mem += T0; T0 = old *pu64Mem
        mov     [A1], T0                ; *pu64Reg = old *pu64Mem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1324
; Locked variants of the above (guest had a LOCK prefix on the XADD).
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = *pu8Reg
        lock xadd [A0], T0_8            ; atomically: *pu8Mem += T0; T0 = old *pu8Mem
        mov     [A1], T0_8              ; *pu8Reg = old *pu8Mem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1366
1367
1368;
1369; CMPXCHG8B.
1370;
1371; These are tricky register wise, so the code is duplicated for each calling
1372; convention.
1373;
1374; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1375;
1376; C-proto:
1377; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1378; uint32_t *pEFlags));
1379;
1380; Note! Identical to iemAImpl_cmpxchg16b.
1381;
1382BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; cmpxchg8b needs ebx; rbx is callee-saved

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; ecx:ebx = *pu64EbxEcx (value to store on match)
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; edx:eax = *pu64EaxEdx (expected value)
        mov     edx, [r11 + 4]

        cmpxchg8b [r10]

        mov     [r11], eax              ; write back edx:eax (old dest on mismatch, unchanged on match)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; cmpxchg8b needs ebx; rbx is callee-saved

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; ecx:ebx = *pu64EbxEcx (value to store on match)
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; edx:eax = *pu64EaxEdx (expected value)
        mov     edx, [rsi + 4]

        cmpxchg8b [rdi]

        mov     [rsi], eax              ; write back edx:eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; 32-bit host: save all callee-saved registers we borrow below.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = 4 pushes, 4 = return address)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = *pu64EbxEcx (value to store on match)
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = *pu64EaxEdx (expected value)
        mov     edx, [esi + 4]

        cmpxchg8b [edi]

        mov     [esi], eax              ; write back edx:eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1457
; Same as iemAImpl_cmpxchg8b above, except the cmpxchg8b carries a LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; cmpxchg8b needs ebx; rbx is callee-saved

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; ecx:ebx = *pu64EbxEcx (value to store on match)
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; edx:eax = *pu64EaxEdx (expected value)
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write back edx:eax
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; cmpxchg8b needs ebx; rbx is callee-saved

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; ecx:ebx = *pu64EbxEcx (value to store on match)
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; edx:eax = *pu64EaxEdx (expected value)
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; write back edx:eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; 32-bit host: save all callee-saved registers we borrow below.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = 4 pushes, 4 = return address)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = *pu64EbxEcx (value to store on match)
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = *pu64EaxEdx (expected value)
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; write back edx:eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
1532
1533%ifdef RT_ARCH_AMD64
1534
1535;
1536; CMPXCHG16B.
1537;
1538; These are tricky register wise, so the code is duplicated for each calling
1539; convention.
1540;
1541; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1542;
1543; C-proto:
1544; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
1545; uint32_t *pEFlags));
1546;
1547; Note! Identical to iemAImpl_cmpxchg8b.
1548;
1549BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; cmpxchg16b needs rbx; rbx is callee-saved

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; rcx:rbx = *pu128RbxRcx (value to store on match)
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = *pu128RaxRdx (expected value)
        mov     rdx, [r11 + 8]

        cmpxchg16b [r10]

        mov     [r11], rax              ; write back rdx:rax (old dest on mismatch, unchanged on match)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; cmpxchg16b needs rbx; rbx is callee-saved

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = *pu128RbxRcx (value to store on match)
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = *pu128RaxRdx (expected value)
        mov     rdx, [rsi + 8]

        cmpxchg16b [rdi]

        mov     [rsi], rax              ; write back rdx:rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1594
; Same as iemAImpl_cmpxchg16b above, except the cmpxchg16b carries a LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; cmpxchg16b needs rbx; rbx is callee-saved

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; rcx:rbx = *pu128RbxRcx (value to store on match)
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = *pu128RaxRdx (expected value)
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write back rdx:rax
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; cmpxchg16b needs rbx; rbx is callee-saved

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = *pu128RbxRcx (value to store on match)
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = *pu128RaxRdx (expected value)
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; write back rdx:rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked
1639
1640%endif ; RT_ARCH_AMD64
1641
1642
1643;
1644; CMPXCHG.
1645;
1646; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1647;
1648; C-proto:
1649; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1650;
1651BEGINCODE
; @param 1      Empty for the regular variant, 'lock' for the locked one.
; @param 2      Function name suffix ('' or '_locked').
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; al = expected value (*puAl)
        %1      cmpxchg [A0], A2_8      ; match: *puXDst = uReg, ZF=1; mismatch: al = *puXDst, ZF=0
        mov     [A1], al                ; write the accumulator back
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; ax = expected value (*puAx)
        %1      cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; eax = expected value (*puEax)
        %1      cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; rax = expected value (*puRax)
        %1      cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = *pu64Reg (value to store on match)
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = *pu64Rax (expected value)
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b only sets ZF; CF, PF, AF, SF and OF must be derived from a
        ; real comparison like cmpxchg would do.
        ; Note! This was 'jz', which reported success flags (from 'cmp eax, eax')
        ;       for a FAILED exchange; ZF=1 after cmpxchg8b means equal/success,
        ;       so the mismatch fixup must be taken on ZF=0.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; equal: just set the other flags.
.store:
        mov     [esi], eax              ; *pu64Rax = edx:eax (old dest on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1740
;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1 (unary operators have no source operand).
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; NOT is the only unary operator that leaves EFLAGS alone (modified = 0).
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1829
1830
1831;
1832; BSWAP. No flag changes.
1833;
1834; Each function takes one argument, pointer to the value to bswap
1835; (input/output). They all return void.
1836;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db      66h                     ; operand-size prefix: together with the next insn this
                                        ; encodes 'bswap r16', whose result the SDM leaves
                                        ; undefined. NOTE(review): presumably mirroring real
                                        ; CPU behaviour for the guest - confirm.
        bswap   T0_32
        mov     [A0], T0_32             ; stores all 32 bits that were read above.
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16

BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32                   ; reverse the four bytes
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32

BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: byte-swap each half and store them crosswise.
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; T0 = low dword
        mov     T1, [A0 + 4]            ; T1 = high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; swapped low dword becomes the high dword
        mov     [A0], T1                ; and vice versa
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1872
1873
;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; the count must be in cl; xCX is free here.
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; A0 maps to xCX here, so this puts the count in cl
        %1      byte [A1], cl           ; ... and the destination pointer in A1.
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
 %else
        xchg    A1, A0
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1960
1961
;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The functions takes the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; A3 maps to xCX: swap the count into cl...
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; ...and swap back, IEM_SAVE_FLAGS needs A3 (pEFlags).
 %else
        xchg    A0, A2                  ; A0 maps to xCX: count into cl, dest pointer into A2.
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2032
2033
;;
; Macro for implementing a multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 so the caller can be used for div/idiv as well as
; for the mul/imul implementation.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
; @param 4      Name suffix.
; @param 5      EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]                ; al = *pu16AX (low byte)
        %1      A1_8                    ; ax = al * operand
        mov     [A0], ax                ; store the full 16-bit product
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        ; Intel: recalculate SF/PF from the result, adjust AF/ZF.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; return 0 (shared convention with div/idiv)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]                ; ax = *pu16AX
 %ifdef ASM_CALL64_GCC
        %1      A2_16                   ; dx:ax = ax * operand
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; save A1 - it maps to xDX, which the insn clobbers.
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]               ; eax = *pu32EAX
 %ifdef ASM_CALL64_GCC
        %1      A2_32                   ; edx:eax = eax * operand
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; save A1 - it maps to xDX, which the insn clobbers.
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]               ; rax = *pu64RAX
 %ifdef ASM_CALL64_GCC
        %1      A2                      ; rdx:rax = rax * operand
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; save A1 - it maps to xDX, which the insn clobbers.
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2151
2152
BEGINCODE
;;
; Worker function for negating a 32-bit number in T1:T0, i.e. the double-width
; value spread over the two 32-bit registers (T0 = low half, T1 = high half,
; per the sub/sbb pairing below).  Goes via the stack so no other register is
; disturbed.
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0                       ; two zero temporaries on the stack
        push    0
        xchg    T0_32, [xSP]            ; T0 = 0, [xSP]      = old T0
        xchg    T1_32, [xSP + xCB]      ; T1 = 0, [xSP+xCB]  = old T1
        sub     T0_32, [xSP]            ; T0 = 0 - old T0
        sbb     T1_32, [xSP + xCB]      ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the temporaries
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2167
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating a 64-bit number in T1:T0, i.e. the double-width
; value spread over the two 64-bit registers (T0 = low half, T1 = high half,
; per the sub/sbb pairing below).  Goes via the stack so no other register is
; disturbed.
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0                       ; two zero temporaries on the stack
        push    0
        xchg    T0, [xSP]               ; T0 = 0, [xSP]      = old T0
        xchg    T1, [xSP + xCB]         ; T1 = 0, [xSP+xCB]  = old T1
        sub     T0, [xSP]               ; T0 = 0 - old T0
        sbb     T1, [xSP + xCB]         ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the temporaries
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2183
2184
2185;;
2186; Macro for implementing a division operations.
2187;
2188; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2189; 32-bit system where the 64-bit accesses requires hand coding.
2190;
2191; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2192; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2193; pointer to eflags in A3.
2194;
2195; The functions all return 0 on success and -1 if a divide error should be
2196; raised by the caller.
2197;
2198; @param 1 The instruction mnemonic.
2199; @param 2 The modified flags.
2200; @param 3 The undefined flags.
2201; @param 4 1 if signed, 0 if unsigned.
2202; @param 5 Function suffix.
2203; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2204; 2 for AMD (set AF, clear PF, ZF and SF).
2205;
2206; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2207;
2208%macro IEMIMPL_DIV_OP 6
2209BEGINCODE
2210BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2211 PROLOGUE_3_ARGS
2212
2213 ; div by chainsaw check.
2214 test A1_8, A1_8
2215 jz .div_zero
2216
2217 ; Overflow check - unsigned division is simple to verify, haven't
2218 ; found a simple way to check signed division yet unfortunately.
2219 %if %4 == 0
2220 cmp [A0 + 1], A1_8
2221 jae .div_overflow
2222 %else
2223 mov T0_16, [A0] ; T0 = dividend
2224 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2225 test A1_8, A1_8
2226 js .divisor_negative
2227 test T0_16, T0_16
2228 jns .both_positive
2229 neg T0_16
2230.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2231 push T0 ; Start off like unsigned below.
2232 shr T0_16, 7
2233 cmp T0_8, A1_8
2234 pop T0
2235 jb .div_no_overflow
2236 ja .div_overflow
2237 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2238 cmp T0_8, A1_8
2239 jae .div_overflow
2240 jmp .div_no_overflow
2241
2242.divisor_negative:
2243 neg A1_8
2244 test T0_16, T0_16
2245 jns .one_of_each
2246 neg T0_16
2247.both_positive: ; Same as unsigned shifted by sign indicator bit.
2248 shr T0_16, 7
2249 cmp T0_8, A1_8
2250 jae .div_overflow
2251.div_no_overflow:
2252 mov A1, T1 ; restore divisor
2253 %endif
2254
2255 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2256 mov ax, [A0]
2257 %1 A1_8
2258 mov [A0], ax
2259 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2260 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2261 %else
2262 IEM_SAVE_FLAGS A2, %2, %3
2263 %endif
2264 xor eax, eax
2265
2266.return:
2267 EPILOGUE_3_ARGS
2268
2269.div_zero:
2270.div_overflow:
2271 mov eax, -1
2272 jmp .return
2273ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2274
2275BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2276 PROLOGUE_4_ARGS
2277
2278 ; div by chainsaw check.
2279 test A2_16, A2_16
2280 jz .div_zero
2281
2282 ; Overflow check - unsigned division is simple to verify, haven't
2283 ; found a simple way to check signed division yet unfortunately.
2284 %if %4 == 0
2285 cmp [A1], A2_16
2286 jae .div_overflow
2287 %else
2288 mov T0_16, [A1]
2289 shl T0_32, 16
2290 mov T0_16, [A0] ; T0 = dividend
2291 mov T1, A2 ; T1 = divisor
2292 test T1_16, T1_16
2293 js .divisor_negative
2294 test T0_32, T0_32
2295 jns .both_positive
2296 neg T0_32
2297.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2298 push T0 ; Start off like unsigned below.
2299 shr T0_32, 15
2300 cmp T0_16, T1_16
2301 pop T0
2302 jb .div_no_overflow
2303 ja .div_overflow
2304 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2305 cmp T0_16, T1_16
2306 jae .div_overflow
2307 jmp .div_no_overflow
2308
2309.divisor_negative:
2310 neg T1_16
2311 test T0_32, T0_32
2312 jns .one_of_each
2313 neg T0_32
2314.both_positive: ; Same as unsigned shifted by sign indicator bit.
2315 shr T0_32, 15
2316 cmp T0_16, T1_16
2317 jae .div_overflow
2318.div_no_overflow:
2319 %endif
2320
2321 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2322 %ifdef ASM_CALL64_GCC
2323 mov T1, A2
2324 mov ax, [A0]
2325 mov dx, [A1]
2326 %1 T1_16
2327 mov [A0], ax
2328 mov [A1], dx
2329 %else
2330 mov T1, A1
2331 mov ax, [A0]
2332 mov dx, [T1]
2333 %1 A2_16
2334 mov [A0], ax
2335 mov [T1], dx
2336 %endif
2337 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2338 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2339 %else
2340 IEM_SAVE_FLAGS A3, %2, %3
2341 %endif
2342 xor eax, eax
2343
2344.return:
2345 EPILOGUE_4_ARGS
2346
2347.div_zero:
2348.div_overflow:
2349 mov eax, -1
2350 jmp .return
2351ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2352
2353BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2354 PROLOGUE_4_ARGS
2355
2356 ; div by chainsaw check.
2357 test A2_32, A2_32
2358 jz .div_zero
2359
2360 ; Overflow check - unsigned division is simple to verify, haven't
2361 ; found a simple way to check signed division yet unfortunately.
2362 %if %4 == 0
2363 cmp [A1], A2_32
2364 jae .div_overflow
2365 %else
2366 push A2 ; save A2 so we modify it (we out of regs on x86).
2367 mov T0_32, [A0] ; T0 = dividend low
2368 mov T1_32, [A1] ; T1 = dividend high
2369 test A2_32, A2_32
2370 js .divisor_negative
2371 test T1_32, T1_32
2372 jns .both_positive
2373 call NAME(iemAImpl_negate_T0_T1_u32)
2374.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2375 push T0 ; Start off like unsigned below.
2376 shl T1_32, 1
2377 shr T0_32, 31
2378 or T1_32, T0_32
2379 cmp T1_32, A2_32
2380 pop T0
2381 jb .div_no_overflow
2382 ja .div_overflow
2383 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2384 cmp T0_32, A2_32
2385 jae .div_overflow
2386 jmp .div_no_overflow
2387
2388.divisor_negative:
2389 neg A2_32
2390 test T1_32, T1_32
2391 jns .one_of_each
2392 call NAME(iemAImpl_negate_T0_T1_u32)
2393.both_positive: ; Same as unsigned shifted by sign indicator bit.
2394 shl T1_32, 1
2395 shr T0_32, 31
2396 or T1_32, T0_32
2397 cmp T1_32, A2_32
2398 jae .div_overflow
2399.div_no_overflow:
2400 pop A2
2401 %endif
2402
2403 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2404 mov eax, [A0]
2405 %ifdef ASM_CALL64_GCC
2406 mov T1, A2
2407 mov eax, [A0]
2408 mov edx, [A1]
2409 %1 T1_32
2410 mov [A0], eax
2411 mov [A1], edx
2412 %else
2413 mov T1, A1
2414 mov eax, [A0]
2415 mov edx, [T1]
2416 %1 A2_32
2417 mov [A0], eax
2418 mov [T1], edx
2419 %endif
2420 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2421 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2422 %else
2423 IEM_SAVE_FLAGS A3, %2, %3
2424 %endif
2425 xor eax, eax
2426
2427.return:
2428 EPILOGUE_4_ARGS
2429
2430.div_overflow:
2431 %if %4 != 0
2432 pop A2
2433 %endif
2434.div_zero:
2435 mov eax, -1
2436 jmp .return
2437ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2438
2439 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2440BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2441 PROLOGUE_4_ARGS
2442
2443 test A2, A2
2444 jz .div_zero
2445 %if %4 == 0
2446 cmp [A1], A2
2447 jae .div_overflow
2448 %else
2449 push A2 ; save A2 so we modify it (we out of regs on x86).
2450 mov T0, [A0] ; T0 = dividend low
2451 mov T1, [A1] ; T1 = dividend high
2452 test A2, A2
2453 js .divisor_negative
2454 test T1, T1
2455 jns .both_positive
2456 call NAME(iemAImpl_negate_T0_T1_u64)
2457.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2458 push T0 ; Start off like unsigned below.
2459 shl T1, 1
2460 shr T0, 63
2461 or T1, T0
2462 cmp T1, A2
2463 pop T0
2464 jb .div_no_overflow
2465 ja .div_overflow
2466 mov T1, 0x7fffffffffffffff
2467 and T0, T1 ; Special case for covering (divisor - 1).
2468 cmp T0, A2
2469 jae .div_overflow
2470 jmp .div_no_overflow
2471
2472.divisor_negative:
2473 neg A2
2474 test T1, T1
2475 jns .one_of_each
2476 call NAME(iemAImpl_negate_T0_T1_u64)
2477.both_positive: ; Same as unsigned shifted by sign indicator bit.
2478 shl T1, 1
2479 shr T0, 63
2480 or T1, T0
2481 cmp T1, A2
2482 jae .div_overflow
2483.div_no_overflow:
2484 pop A2
2485 %endif
2486
2487 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2488 mov rax, [A0]
2489 %ifdef ASM_CALL64_GCC
2490 mov T1, A2
2491 mov rax, [A0]
2492 mov rdx, [A1]
2493 %1 T1
2494 mov [A0], rax
2495 mov [A1], rdx
2496 %else
2497 mov T1, A1
2498 mov rax, [A0]
2499 mov rdx, [T1]
2500 %1 A2
2501 mov [A0], rax
2502 mov [T1], rdx
2503 %endif
2504 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2505 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2506 %else
2507 IEM_SAVE_FLAGS A3, %2, %3
2508 %endif
2509 xor eax, eax
2510
2511.return:
2512 EPILOGUE_4_ARGS_EX 12
2513
2514.div_overflow:
2515 %if %4 != 0
2516 pop A2
2517 %endif
2518.div_zero:
2519 mov eax, -1
2520 jmp .return
2521ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2522 %endif ; !RT_ARCH_AMD64
2523
2524%endmacro
2525
; Instantiate div/idiv workers: the plain variant saves all (undefined) flags
; natively, while the _intel and _amd variants emulate the vendor-specific
; treatment of the undefined flags (handled inside the macro via param 6).
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2532
2533
2534;;
2535; Macro for implementing memory fence operation.
2536;
2537; No return value, no operands or anything.
2538;
2539; @param 1 The instruction.
2540;
2541%macro IEMIMPL_MEM_FENCE 1
2542BEGINCODE
2543BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
2544 %1
2545 ret
2546ENDPROC iemAImpl_ %+ %1
2547%endmacro
2548
2549IEMIMPL_MEM_FENCE lfence
2550IEMIMPL_MEM_FENCE sfence
2551IEMIMPL_MEM_FENCE mfence
2552
2553;;
2554; Alternative for non-SSE2 host.
2555;
2556BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
2557 push xAX
2558 xchg xAX, [xSP]
2559 add xSP, xCB
2560 ret
2561ENDPROC iemAImpl_alt_mem_fence
2562
2563
2564;;
2565; Initialize the FPU for the actual instruction being emulated, this means
2566; loading parts of the guest's control word and status word.
2567;
2568; @uses 24 bytes of stack. T0, T1
2569; @param 1 Expression giving the address of the FXSTATE of the guest.
2570;
2571%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
2572 fnstenv [xSP]
2573
2574 ; FCW - for exception, precision and rounding control.
2575 movzx T0, word [%1 + X86FXSTATE.FCW]
2576 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2577 mov [xSP + X86FSTENV32P.FCW], T0_16
2578
2579 ; FSW - for undefined C0, C1, C2, and C3.
2580 movzx T1, word [%1 + X86FXSTATE.FSW]
2581 and T1, X86_FSW_C_MASK
2582 movzx T0, word [xSP + X86FSTENV32P.FSW]
2583 and T0, X86_FSW_TOP_MASK
2584 or T0, T1
2585 mov [xSP + X86FSTENV32P.FSW], T0_16
2586
2587 fldenv [xSP]
2588%endmacro
2589
2590
2591;;
2592; Initialize the FPU for the actual instruction being emulated, this means
2593; loading parts of the guest's control word, status word, and update the
2594; tag word for the top register if it's empty.
2595;
2596; ASSUMES actual TOP=7
2597;
2598; @uses 24 bytes of stack. T0, T1
2599; @param 1 Expression giving the address of the FXSTATE of the guest.
2600;
2601%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
2602 fnstenv [xSP]
2603
2604 ; FCW - for exception, precision and rounding control.
2605 movzx T0_32, word [%1 + X86FXSTATE.FCW]
2606 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2607 mov [xSP + X86FSTENV32P.FCW], T0_16
2608
2609 ; FSW - for undefined C0, C1, C2, and C3.
2610 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2611 and T1_32, X86_FSW_C_MASK
2612 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
2613 and T0_32, X86_FSW_TOP_MASK
2614 or T0_32, T1_32
2615 mov [xSP + X86FSTENV32P.FSW], T0_16
2616
2617 ; FTW - Only for ST0 (in/out).
2618 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2619 shr T1_32, X86_FSW_TOP_SHIFT
2620 and T1_32, X86_FSW_TOP_SMASK
2621 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
2622 jc %%st0_not_empty
2623 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
2624%%st0_not_empty:
2625
2626 fldenv [xSP]
2627%endmacro
2628
2629
2630;;
2631; Need to move this as well somewhere better?
2632;
2633struc IEMFPURESULT
2634 .r80Result resw 5
2635 .FSW resw 1
2636endstruc
2637
2638
2639;;
2640; Need to move this as well somewhere better?
2641;
2642struc IEMFPURESULTTWO
2643 .r80Result1 resw 5
2644 .FSW resw 1
2645 .r80Result2 resw 5
2646endstruc
2647
2648
2649;
2650;---------------------- 16-bit signed integer operations ----------------------
2651;
2652
2653
2654;;
2655; Converts a 16-bit floating point value to a 80-bit one (fpu register).
2656;
2657; @param A0 FPU context (fxsave).
2658; @param A1 Pointer to a IEMFPURESULT for the output.
2659; @param A2 Pointer to the 16-bit floating point value to convert.
2660;
2661BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
2662 PROLOGUE_3_ARGS
2663 sub xSP, 20h
2664
2665 fninit
2666 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2667 fild word [A2]
2668
2669 fnstsw word [A1 + IEMFPURESULT.FSW]
2670 fnclex
2671 fstp tword [A1 + IEMFPURESULT.r80Result]
2672
2673 fninit
2674 add xSP, 20h
2675 EPILOGUE_3_ARGS
2676ENDPROC iemAImpl_fild_r80_from_i16
2677
2678
2679;;
2680; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2681;
2682; @param A0 FPU context (fxsave).
2683; @param A1 Where to return the output FSW.
2684; @param A2 Where to store the 16-bit signed integer value.
2685; @param A3 Pointer to the 80-bit value.
2686;
2687BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
2688 PROLOGUE_4_ARGS
2689 sub xSP, 20h
2690
2691 fninit
2692 fld tword [A3]
2693 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2694 fistp word [A2]
2695
2696 fnstsw word [A1]
2697
2698 fninit
2699 add xSP, 20h
2700 EPILOGUE_4_ARGS
2701ENDPROC iemAImpl_fist_r80_to_i16
2702
2703
2704;;
2705; Store a 80-bit floating point value (register) as a 16-bit signed integer
2706; (memory) with truncation.
2707;
2708; @param A0 FPU context (fxsave).
2709; @param A1 Where to return the output FSW.
2710; @param A2 Where to store the 16-bit signed integer value.
2711; @param A3 Pointer to the 80-bit value.
2712;
2713BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
2714 PROLOGUE_4_ARGS
2715 sub xSP, 20h
2716
2717 fninit
2718 fld tword [A3]
2719 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2720 fisttp word [A2]
2721
2722 fnstsw word [A1]
2723
2724 fninit
2725 add xSP, 20h
2726 EPILOGUE_4_ARGS
2727ENDPROC iemAImpl_fistt_r80_to_i16
2728
2729
2730;;
2731; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2732;
2733; @param 1 The instruction
2734;
2735; @param A0 FPU context (fxsave).
2736; @param A1 Pointer to a IEMFPURESULT for the output.
2737; @param A2 Pointer to the 80-bit value.
2738; @param A3 Pointer to the 16-bit value.
2739;
2740%macro IEMIMPL_FPU_R80_BY_I16 1
2741BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2742 PROLOGUE_4_ARGS
2743 sub xSP, 20h
2744
2745 fninit
2746 fld tword [A2]
2747 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2748 %1 word [A3]
2749
2750 fnstsw word [A1 + IEMFPURESULT.FSW]
2751 fnclex
2752 fstp tword [A1 + IEMFPURESULT.r80Result]
2753
2754 fninit
2755 add xSP, 20h
2756 EPILOGUE_4_ARGS
2757ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2758%endmacro
2759
2760IEMIMPL_FPU_R80_BY_I16 fiadd
2761IEMIMPL_FPU_R80_BY_I16 fimul
2762IEMIMPL_FPU_R80_BY_I16 fisub
2763IEMIMPL_FPU_R80_BY_I16 fisubr
2764IEMIMPL_FPU_R80_BY_I16 fidiv
2765IEMIMPL_FPU_R80_BY_I16 fidivr
2766
2767
2768;;
2769; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2770; only returning FSW.
2771;
2772; @param 1 The instruction
2773;
2774; @param A0 FPU context (fxsave).
2775; @param A1 Where to store the output FSW.
2776; @param A2 Pointer to the 80-bit value.
2777; @param A3 Pointer to the 64-bit value.
2778;
2779%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
2780BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2781 PROLOGUE_4_ARGS
2782 sub xSP, 20h
2783
2784 fninit
2785 fld tword [A2]
2786 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2787 %1 word [A3]
2788
2789 fnstsw word [A1]
2790
2791 fninit
2792 add xSP, 20h
2793 EPILOGUE_4_ARGS
2794ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2795%endmacro
2796
2797IEMIMPL_FPU_R80_BY_I16_FSW ficom
2798
2799
2800
2801;
2802;---------------------- 32-bit signed integer operations ----------------------
2803;
2804
2805
2806;;
2807; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2808;
2809; @param A0 FPU context (fxsave).
2810; @param A1 Pointer to a IEMFPURESULT for the output.
2811; @param A2 Pointer to the 32-bit floating point value to convert.
2812;
2813BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
2814 PROLOGUE_3_ARGS
2815 sub xSP, 20h
2816
2817 fninit
2818 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2819 fild dword [A2]
2820
2821 fnstsw word [A1 + IEMFPURESULT.FSW]
2822 fnclex
2823 fstp tword [A1 + IEMFPURESULT.r80Result]
2824
2825 fninit
2826 add xSP, 20h
2827 EPILOGUE_3_ARGS
2828ENDPROC iemAImpl_fild_r80_from_i32
2829
2830
2831;;
2832; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2833;
2834; @param A0 FPU context (fxsave).
2835; @param A1 Where to return the output FSW.
2836; @param A2 Where to store the 32-bit signed integer value.
2837; @param A3 Pointer to the 80-bit value.
2838;
2839BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2840 PROLOGUE_4_ARGS
2841 sub xSP, 20h
2842
2843 fninit
2844 fld tword [A3]
2845 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2846 fistp dword [A2]
2847
2848 fnstsw word [A1]
2849
2850 fninit
2851 add xSP, 20h
2852 EPILOGUE_4_ARGS
2853ENDPROC iemAImpl_fist_r80_to_i32
2854
2855
2856;;
2857; Store a 80-bit floating point value (register) as a 32-bit signed integer
2858; (memory) with truncation.
2859;
2860; @param A0 FPU context (fxsave).
2861; @param A1 Where to return the output FSW.
2862; @param A2 Where to store the 32-bit signed integer value.
2863; @param A3 Pointer to the 80-bit value.
2864;
2865BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2866 PROLOGUE_4_ARGS
2867 sub xSP, 20h
2868
2869 fninit
2870 fld tword [A3]
2871 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2872 fisttp dword [A2]
2873
2874 fnstsw word [A1]
2875
2876 fninit
2877 add xSP, 20h
2878 EPILOGUE_4_ARGS
2879ENDPROC iemAImpl_fistt_r80_to_i32
2880
2881
2882;;
2883; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2884;
2885; @param 1 The instruction
2886;
2887; @param A0 FPU context (fxsave).
2888; @param A1 Pointer to a IEMFPURESULT for the output.
2889; @param A2 Pointer to the 80-bit value.
2890; @param A3 Pointer to the 32-bit value.
2891;
2892%macro IEMIMPL_FPU_R80_BY_I32 1
2893BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2894 PROLOGUE_4_ARGS
2895 sub xSP, 20h
2896
2897 fninit
2898 fld tword [A2]
2899 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2900 %1 dword [A3]
2901
2902 fnstsw word [A1 + IEMFPURESULT.FSW]
2903 fnclex
2904 fstp tword [A1 + IEMFPURESULT.r80Result]
2905
2906 fninit
2907 add xSP, 20h
2908 EPILOGUE_4_ARGS
2909ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2910%endmacro
2911
2912IEMIMPL_FPU_R80_BY_I32 fiadd
2913IEMIMPL_FPU_R80_BY_I32 fimul
2914IEMIMPL_FPU_R80_BY_I32 fisub
2915IEMIMPL_FPU_R80_BY_I32 fisubr
2916IEMIMPL_FPU_R80_BY_I32 fidiv
2917IEMIMPL_FPU_R80_BY_I32 fidivr
2918
2919
2920;;
2921; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2922; only returning FSW.
2923;
2924; @param 1 The instruction
2925;
2926; @param A0 FPU context (fxsave).
2927; @param A1 Where to store the output FSW.
2928; @param A2 Pointer to the 80-bit value.
2929; @param A3 Pointer to the 64-bit value.
2930;
2931%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2932BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2933 PROLOGUE_4_ARGS
2934 sub xSP, 20h
2935
2936 fninit
2937 fld tword [A2]
2938 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2939 %1 dword [A3]
2940
2941 fnstsw word [A1]
2942
2943 fninit
2944 add xSP, 20h
2945 EPILOGUE_4_ARGS
2946ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2947%endmacro
2948
2949IEMIMPL_FPU_R80_BY_I32_FSW ficom
2950
2951
2952
2953;
2954;---------------------- 64-bit signed integer operations ----------------------
2955;
2956
2957
2958;;
2959; Converts a 64-bit floating point value to a 80-bit one (fpu register).
2960;
2961; @param A0 FPU context (fxsave).
2962; @param A1 Pointer to a IEMFPURESULT for the output.
2963; @param A2 Pointer to the 64-bit floating point value to convert.
2964;
2965BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
2966 PROLOGUE_3_ARGS
2967 sub xSP, 20h
2968
2969 fninit
2970 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2971 fild qword [A2]
2972
2973 fnstsw word [A1 + IEMFPURESULT.FSW]
2974 fnclex
2975 fstp tword [A1 + IEMFPURESULT.r80Result]
2976
2977 fninit
2978 add xSP, 20h
2979 EPILOGUE_3_ARGS
2980ENDPROC iemAImpl_fild_r80_from_i64
2981
2982
2983;;
2984; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2985;
2986; @param A0 FPU context (fxsave).
2987; @param A1 Where to return the output FSW.
2988; @param A2 Where to store the 64-bit signed integer value.
2989; @param A3 Pointer to the 80-bit value.
2990;
2991BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2992 PROLOGUE_4_ARGS
2993 sub xSP, 20h
2994
2995 fninit
2996 fld tword [A3]
2997 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2998 fistp qword [A2]
2999
3000 fnstsw word [A1]
3001
3002 fninit
3003 add xSP, 20h
3004 EPILOGUE_4_ARGS
3005ENDPROC iemAImpl_fist_r80_to_i64
3006
3007
3008;;
3009; Store a 80-bit floating point value (register) as a 64-bit signed integer
3010; (memory) with truncation.
3011;
3012; @param A0 FPU context (fxsave).
3013; @param A1 Where to return the output FSW.
3014; @param A2 Where to store the 64-bit signed integer value.
3015; @param A3 Pointer to the 80-bit value.
3016;
3017BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
3018 PROLOGUE_4_ARGS
3019 sub xSP, 20h
3020
3021 fninit
3022 fld tword [A3]
3023 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3024 fisttp qword [A2]
3025
3026 fnstsw word [A1]
3027
3028 fninit
3029 add xSP, 20h
3030 EPILOGUE_4_ARGS
3031ENDPROC iemAImpl_fistt_r80_to_i64
3032
3033
3034
3035;
3036;---------------------- 32-bit floating point operations ----------------------
3037;
3038
3039;;
3040; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3041;
3042; @param A0 FPU context (fxsave).
3043; @param A1 Pointer to a IEMFPURESULT for the output.
3044; @param A2 Pointer to the 32-bit floating point value to convert.
3045;
3046BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
3047 PROLOGUE_3_ARGS
3048 sub xSP, 20h
3049
3050 fninit
3051 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3052 fld dword [A2]
3053
3054 fnstsw word [A1 + IEMFPURESULT.FSW]
3055 fnclex
3056 fstp tword [A1 + IEMFPURESULT.r80Result]
3057
3058 fninit
3059 add xSP, 20h
3060 EPILOGUE_3_ARGS
3061ENDPROC iemAImpl_fld_r80_from_r32
3062
3063
3064;;
3065; Store a 80-bit floating point value (register) as a 32-bit one (memory).
3066;
3067; @param A0 FPU context (fxsave).
3068; @param A1 Where to return the output FSW.
3069; @param A2 Where to store the 32-bit value.
3070; @param A3 Pointer to the 80-bit value.
3071;
3072BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
3073 PROLOGUE_4_ARGS
3074 sub xSP, 20h
3075
3076 fninit
3077 fld tword [A3]
3078 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3079 fst dword [A2]
3080
3081 fnstsw word [A1]
3082
3083 fninit
3084 add xSP, 20h
3085 EPILOGUE_4_ARGS
3086ENDPROC iemAImpl_fst_r80_to_r32
3087
3088
3089;;
3090; FPU instruction working on one 80-bit and one 32-bit floating point value.
3091;
3092; @param 1 The instruction
3093;
3094; @param A0 FPU context (fxsave).
3095; @param A1 Pointer to a IEMFPURESULT for the output.
3096; @param A2 Pointer to the 80-bit value.
3097; @param A3 Pointer to the 32-bit value.
3098;
3099%macro IEMIMPL_FPU_R80_BY_R32 1
3100BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3101 PROLOGUE_4_ARGS
3102 sub xSP, 20h
3103
3104 fninit
3105 fld tword [A2]
3106 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3107 %1 dword [A3]
3108
3109 fnstsw word [A1 + IEMFPURESULT.FSW]
3110 fnclex
3111 fstp tword [A1 + IEMFPURESULT.r80Result]
3112
3113 fninit
3114 add xSP, 20h
3115 EPILOGUE_4_ARGS
3116ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3117%endmacro
3118
3119IEMIMPL_FPU_R80_BY_R32 fadd
3120IEMIMPL_FPU_R80_BY_R32 fmul
3121IEMIMPL_FPU_R80_BY_R32 fsub
3122IEMIMPL_FPU_R80_BY_R32 fsubr
3123IEMIMPL_FPU_R80_BY_R32 fdiv
3124IEMIMPL_FPU_R80_BY_R32 fdivr
3125
3126
3127;;
3128; FPU instruction working on one 80-bit and one 32-bit floating point value,
3129; only returning FSW.
3130;
3131; @param 1 The instruction
3132;
3133; @param A0 FPU context (fxsave).
3134; @param A1 Where to store the output FSW.
3135; @param A2 Pointer to the 80-bit value.
3136; @param A3 Pointer to the 64-bit value.
3137;
3138%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3139BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3140 PROLOGUE_4_ARGS
3141 sub xSP, 20h
3142
3143 fninit
3144 fld tword [A2]
3145 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3146 %1 dword [A3]
3147
3148 fnstsw word [A1]
3149
3150 fninit
3151 add xSP, 20h
3152 EPILOGUE_4_ARGS
3153ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3154%endmacro
3155
3156IEMIMPL_FPU_R80_BY_R32_FSW fcom
3157
3158
3159
3160;
3161;---------------------- 64-bit floating point operations ----------------------
3162;
3163
3164;;
3165; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3166;
3167; @param A0 FPU context (fxsave).
3168; @param A1 Pointer to a IEMFPURESULT for the output.
3169; @param A2 Pointer to the 64-bit floating point value to convert.
3170;
3171BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3172 PROLOGUE_3_ARGS
3173 sub xSP, 20h
3174
3175 fninit
3176 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3177 fld qword [A2]
3178
3179 fnstsw word [A1 + IEMFPURESULT.FSW]
3180 fnclex
3181 fstp tword [A1 + IEMFPURESULT.r80Result]
3182
3183 fninit
3184 add xSP, 20h
3185 EPILOGUE_3_ARGS
3186ENDPROC iemAImpl_fld_r80_from_r64
3187
3188
3189;;
3190; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3191;
3192; @param A0 FPU context (fxsave).
3193; @param A1 Where to return the output FSW.
3194; @param A2 Where to store the 64-bit value.
3195; @param A3 Pointer to the 80-bit value.
3196;
3197BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3198 PROLOGUE_4_ARGS
3199 sub xSP, 20h
3200
3201 fninit
3202 fld tword [A3]
3203 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3204 fst qword [A2]
3205
3206 fnstsw word [A1]
3207
3208 fninit
3209 add xSP, 20h
3210 EPILOGUE_4_ARGS
3211ENDPROC iemAImpl_fst_r80_to_r64
3212
3213
3214;;
3215; FPU instruction working on one 80-bit and one 64-bit floating point value.
3216;
3217; @param 1 The instruction
3218;
3219; @param A0 FPU context (fxsave).
3220; @param A1 Pointer to a IEMFPURESULT for the output.
3221; @param A2 Pointer to the 80-bit value.
3222; @param A3 Pointer to the 64-bit value.
3223;
3224%macro IEMIMPL_FPU_R80_BY_R64 1
3225BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3226 PROLOGUE_4_ARGS
3227 sub xSP, 20h
3228
3229 fninit
3230 fld tword [A2]
3231 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3232 %1 qword [A3]
3233
3234 fnstsw word [A1 + IEMFPURESULT.FSW]
3235 fnclex
3236 fstp tword [A1 + IEMFPURESULT.r80Result]
3237
3238 fninit
3239 add xSP, 20h
3240 EPILOGUE_4_ARGS
3241ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3242%endmacro
3243
3244IEMIMPL_FPU_R80_BY_R64 fadd
3245IEMIMPL_FPU_R80_BY_R64 fmul
3246IEMIMPL_FPU_R80_BY_R64 fsub
3247IEMIMPL_FPU_R80_BY_R64 fsubr
3248IEMIMPL_FPU_R80_BY_R64 fdiv
3249IEMIMPL_FPU_R80_BY_R64 fdivr
3250
3251;;
3252; FPU instruction working on one 80-bit and one 64-bit floating point value,
3253; only returning FSW.
3254;
3255; @param 1 The instruction
3256;
3257; @param A0 FPU context (fxsave).
3258; @param A1 Where to store the output FSW.
3259; @param A2 Pointer to the 80-bit value.
3260; @param A3 Pointer to the 64-bit value.
3261;
3262%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3263BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3264 PROLOGUE_4_ARGS
3265 sub xSP, 20h
3266
3267 fninit
3268 fld tword [A2]
3269 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3270 %1 qword [A3]
3271
3272 fnstsw word [A1]
3273
3274 fninit
3275 add xSP, 20h
3276 EPILOGUE_4_ARGS
3277ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3278%endmacro
3279
3280IEMIMPL_FPU_R80_BY_R64_FSW fcom
3281
3282
3283
3284;
3285;---------------------- 80-bit floating point operations ----------------------
3286;
3287
3288;;
3289; Loads a 80-bit floating point register value from memory.
3290;
3291; @param A0 FPU context (fxsave).
3292; @param A1 Pointer to a IEMFPURESULT for the output.
3293; @param A2 Pointer to the 80-bit floating point value to load.
3294;
3295BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3296 PROLOGUE_3_ARGS
3297 sub xSP, 20h
3298
3299 fninit
3300 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3301 fld tword [A2]
3302
3303 fnstsw word [A1 + IEMFPURESULT.FSW]
3304 fnclex
3305 fstp tword [A1 + IEMFPURESULT.r80Result]
3306
3307 fninit
3308 add xSP, 20h
3309 EPILOGUE_3_ARGS
3310ENDPROC iemAImpl_fld_r80_from_r80
3311
3312
3313;;
3314; Store a 80-bit floating point register to memory
3315;
3316; @param A0 FPU context (fxsave).
3317; @param A1 Where to return the output FSW.
3318; @param A2 Where to store the 80-bit value.
3319; @param A3 Pointer to the 80-bit register value.
3320;
3321BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3322 PROLOGUE_4_ARGS
3323 sub xSP, 20h
3324
3325 fninit
3326 fld tword [A3]
3327 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3328 fstp tword [A2]
3329
3330 fnstsw word [A1]
3331
3332 fninit
3333 add xSP, 20h
3334 EPILOGUE_4_ARGS
3335ENDPROC iemAImpl_fst_r80_to_r80
3336
3337
3338;;
3339; Loads an 80-bit floating point register value in BCD format from memory.
3340;
3341; @param A0 FPU context (fxsave).
3342; @param A1 Pointer to a IEMFPURESULT for the output.
3343; @param A2 Pointer to the 80-bit BCD value to load.
3344;
3345BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3346 PROLOGUE_3_ARGS
3347 sub xSP, 20h
3348
3349 fninit
3350 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3351 fbld tword [A2]
3352
3353 fnstsw word [A1 + IEMFPURESULT.FSW]
3354 fnclex
3355 fstp tword [A1 + IEMFPURESULT.r80Result]
3356
3357 fninit
3358 add xSP, 20h
3359 EPILOGUE_3_ARGS
3360ENDPROC iemAImpl_fld_r80_from_d80
3361
3362
3363;;
3364; Store a 80-bit floating point register to memory as BCD
3365;
3366; @param A0 FPU context (fxsave).
3367; @param A1 Where to return the output FSW.
3368; @param A2 Where to store the 80-bit BCD value.
3369; @param A3 Pointer to the 80-bit register value.
3370;
3371BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3372 PROLOGUE_4_ARGS
3373 sub xSP, 20h
3374
3375 fninit
3376 fld tword [A3]
3377 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3378 fbstp tword [A2]
3379
3380 fnstsw word [A1]
3381
3382 fninit
3383 add xSP, 20h
3384 EPILOGUE_4_ARGS
3385ENDPROC iemAImpl_fst_r80_to_d80
3386
3387
3388;;
3389; FPU instruction working on two 80-bit floating point values.
3390;
3391; @param 1 The instruction
3392;
3393; @param A0 FPU context (fxsave).
3394; @param A1 Pointer to a IEMFPURESULT for the output.
3395; @param A2 Pointer to the first 80-bit value (ST0)
3396; @param A3 Pointer to the second 80-bit value (STn).
3397;
3398%macro IEMIMPL_FPU_R80_BY_R80 2
3399BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3400 PROLOGUE_4_ARGS
3401 sub xSP, 20h
3402
3403 fninit
3404 fld tword [A3]
3405 fld tword [A2]
3406 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3407 %1 %2
3408
3409 fnstsw word [A1 + IEMFPURESULT.FSW]
3410 fnclex
3411 fstp tword [A1 + IEMFPURESULT.r80Result]
3412
3413 fninit
3414 add xSP, 20h
3415 EPILOGUE_4_ARGS
3416ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3417%endmacro
3418
3419IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3420IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3421IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3422IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3423IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3424IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3425IEMIMPL_FPU_R80_BY_R80 fprem, {}
3426IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3427IEMIMPL_FPU_R80_BY_R80 fscale, {}
3428
3429
3430;;
3431; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3432; storing the result in ST1 and popping the stack.
3433;
3434; @param 1 The instruction
3435;
3436; @param A0 FPU context (fxsave).
3437; @param A1 Pointer to a IEMFPURESULT for the output.
3438; @param A2 Pointer to the first 80-bit value (ST1).
3439; @param A3 Pointer to the second 80-bit value (ST0).
3440;
3441%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3442BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3443 PROLOGUE_4_ARGS
3444 sub xSP, 20h
3445
3446 fninit
3447 fld tword [A2]
3448 fld tword [A3]
3449 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3450 %1
3451
3452 fnstsw word [A1 + IEMFPURESULT.FSW]
3453 fnclex
3454 fstp tword [A1 + IEMFPURESULT.r80Result]
3455
3456 fninit
3457 add xSP, 20h
3458 EPILOGUE_4_ARGS
3459ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3460%endmacro
3461
3462IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3463IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3464IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3465
3466
;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW (compare instructions).
;
; @param 1      The instruction
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to a uint16_t for the resulting FSW.
; @param A2     Pointer to the first 80-bit value.
; @param A3     Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]              ; ST1
        fld     tword [A2]              ; ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]               ; only the FSW (C0..C3) is of interest

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3499
3500
;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param 1      The instruction
;
; @returns      EFLAGS in EAX.
; @param A0     FPU context (fxsave).
; @param A1     Pointer to a uint16_t for the resulting FSW.
; @param A2     Pointer to the first 80-bit value.
; @param A3     Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]              ; ST1
        fld     tword [A2]              ; ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1                     ; fcomi/fucomi set ZF/PF/CF directly

        fnstsw  word [A1]
        pushf                           ; capture the EFLAGS the instruction produced
        pop     xAX

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3536
3537
;;
; FPU instruction working on one 80-bit floating point value.
;
; @param 1      The instruction
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to a IEMFPURESULT for the output.
; @param A2     Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]              ; ST0 = input
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; operates on ST0 implicitly

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions so fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3574
3575
;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW.
;
; @param 1      The instruction
; @param 2      Non-zero to also restore FTW (needed by fxam so it can
;               distinguish empty registers).
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to a uint16_t for the resulting FSW.
; @param A2     Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3611
3612
3613
;;
; FPU instruction loading a 80-bit floating point constant.
;
; @param 1      The instruction (fld1, fldpi, ...)
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; pushes the constant onto the FPU stack

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions so fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3648
3649
;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param 1      The instruction
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to a IEMFPURESULTTWO for the output.
; @param A2     Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; leaves two values on the FPU stack

        fnstsw  word [A1 + IEMFPURESULTTWO.FSW]
        fnclex                          ; clear exceptions before each store
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; ST0 = second result
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; ST1 = first result
        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3684
3685
3686
3687
;---------------------- SSE and MMX Operations ----------------------

; Prologue/epilogue placeholder macros for the media instruction workers below.
; They currently expand to nothing; they exist so that any state save/restore
; that turns out to be necessary can be added in one place.

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3707
3708
;;
; Media instruction working on two full sized registers.
;
; @param 1      The instruction
; @param 2      Whether there is an MMX variant (1) or not (0).
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to the first media register size operand (input/output).
; @param A2     Pointer to the second media register size operand (input).
;
; @note The FPU context in A0 is currently unused by the worker bodies.
;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0               ; first operand doubles as destination

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; unaligned loads: operands need not be 16-byte aligned
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb,  1
IEMIMPL_MEDIA_F2 pand,    1
IEMIMPL_MEDIA_F2 pandn,   1
IEMIMPL_MEDIA_F2 por,     1
IEMIMPL_MEDIA_F2 pxor,    1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb,   1
IEMIMPL_MEDIA_F2 paddw,   1
IEMIMPL_MEDIA_F2 paddd,   1
IEMIMPL_MEDIA_F2 paddq,   1
IEMIMPL_MEDIA_F2 paddsb,  1
IEMIMPL_MEDIA_F2 paddsw,  1
IEMIMPL_MEDIA_F2 paddusb, 1
IEMIMPL_MEDIA_F2 paddusw, 1
IEMIMPL_MEDIA_F2 psubb,   1
IEMIMPL_MEDIA_F2 psubw,   1
IEMIMPL_MEDIA_F2 psubd,   1
IEMIMPL_MEDIA_F2 psubq,   1
IEMIMPL_MEDIA_F2 psubsb,  1
IEMIMPL_MEDIA_F2 psubsw,  1
IEMIMPL_MEDIA_F2 psubusb, 1
IEMIMPL_MEDIA_F2 psubusw, 1
IEMIMPL_MEDIA_F2 pmullw,  1
IEMIMPL_MEDIA_F2 pmulld,  0
IEMIMPL_MEDIA_F2 pmulhw,  1
IEMIMPL_MEDIA_F2 pmaddwd, 1
IEMIMPL_MEDIA_F2 pminub,  1
IEMIMPL_MEDIA_F2 pminuw,  0
IEMIMPL_MEDIA_F2 pminud,  0
IEMIMPL_MEDIA_F2 pminsb,  0
IEMIMPL_MEDIA_F2 pminsw,  1
IEMIMPL_MEDIA_F2 pminsd,  0
IEMIMPL_MEDIA_F2 pmaxub,  1
IEMIMPL_MEDIA_F2 pmaxuw,  0
IEMIMPL_MEDIA_F2 pmaxud,  0
IEMIMPL_MEDIA_F2 pmaxsb,  0
IEMIMPL_MEDIA_F2 pmaxsw,  1
IEMIMPL_MEDIA_F2 pmaxsd,  0
IEMIMPL_MEDIA_F2 pabsb,   1
IEMIMPL_MEDIA_F2 pabsw,   1
IEMIMPL_MEDIA_F2 pabsd,   1
IEMIMPL_MEDIA_F2 psignb,  1
IEMIMPL_MEDIA_F2 psignw,  1
IEMIMPL_MEDIA_F2 psignd,  1
IEMIMPL_MEDIA_F2 phaddw,  1
IEMIMPL_MEDIA_F2 phaddd,  1
IEMIMPL_MEDIA_F2 phsubw,  1
IEMIMPL_MEDIA_F2 phsubd,  1
IEMIMPL_MEDIA_F2 phaddsw, 1
IEMIMPL_MEDIA_F2 phsubsw, 1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw,  1
IEMIMPL_MEDIA_F2 pmuludq,   1
3809
3810
;;
; Media instruction working on two full sized registers, but no FXSAVE state argument.
;
; @param 1      The instruction
; @param 2      Whether there is an MMX variant (1) or not (0).
;
; @param A0     Pointer to the first media register size operand (input/output).
; @param A1     Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, [A1]
        %1      mm0, mm1
        movq    [A0], mm0               ; first operand doubles as destination

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; unaligned loads: operands need not be 16-byte aligned
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb,  1
IEMIMPL_MEDIA_OPT_F2 packssdw,  1
IEMIMPL_MEDIA_OPT_F2 packuswb,  1
IEMIMPL_MEDIA_OPT_F2 packusdw,  0
IEMIMPL_MEDIA_OPT_F2 psllw,     1
IEMIMPL_MEDIA_OPT_F2 pslld,     1
IEMIMPL_MEDIA_OPT_F2 psllq,     1
IEMIMPL_MEDIA_OPT_F2 psrlw,     1
IEMIMPL_MEDIA_OPT_F2 psrld,     1
IEMIMPL_MEDIA_OPT_F2 psrlq,     1
IEMIMPL_MEDIA_OPT_F2 psraw,     1
IEMIMPL_MEDIA_OPT_F2 psrad,     1
IEMIMPL_MEDIA_OPT_F2 pmulhuw,   1
IEMIMPL_MEDIA_OPT_F2 pavgb,     1
IEMIMPL_MEDIA_OPT_F2 pavgw,     1
IEMIMPL_MEDIA_OPT_F2 psadbw,    1
IEMIMPL_MEDIA_OPT_F2 pmuldq,    0
IEMIMPL_MEDIA_OPT_F2 unpcklps,  0
IEMIMPL_MEDIA_OPT_F2 unpcklpd,  0
IEMIMPL_MEDIA_OPT_F2 unpckhps,  0
IEMIMPL_MEDIA_OPT_F2 unpckhpd,  0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc,     0
IEMIMPL_MEDIA_OPT_F2 aesenc,     0
IEMIMPL_MEDIA_OPT_F2 aesdec,     0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte,  0
IEMIMPL_MEDIA_OPT_F2 sha1msg1,   0
IEMIMPL_MEDIA_OPT_F2 sha1msg2,   0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3882
;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; @param 1      The instruction
; @param 2      1 if MMX is included, 0 if not.
;
; @param A0     Pointer to the first full sized media register operand (input/output).
; @param A1     Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, [A1]
        %1      mm0, mm1
        movq    [A0], mm0               ; first operand doubles as destination

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3926
3927
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; @param 1      The instruction
;
; @param A0     Pointer to the destination register (full sized, output only).
; @param A1     Pointer to the first full sized media source register operand, where we
;               will only use the lower half as input - but we'll be loading it in full.
; @param A2     Pointer to the second full sized media source register operand, where we
;               will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3972
3973
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; @param 1      The instruction
; @param 2      1 if MMX is included, 0 if not.
;
; @param A0     Pointer to the first full sized media register operand (input/output).
; @param A1     Pointer to the second full sized media register operand, where we
;               will only use the upper half as input - but we'll load it in full.
;
; @note Simply forwards to IEMIMPL_MEDIA_F1L1 - the generated code is identical,
;       only the instruction's own semantics differ.  The invocations below use
;       IEMIMPL_MEDIA_F1L1 directly.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

IEMIMPL_MEDIA_F1L1 punpckhbw,  1
IEMIMPL_MEDIA_F1L1 punpckhwd,  1
IEMIMPL_MEDIA_F1L1 punpckhdq,  1
IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
3992
3993
;;
; Media instruction working two half sized input registers (high half) and a full sized
; destination register (vpunpckh*).
;
; @param 1      The instruction
;
; @param A0     Pointer to the destination register (full sized, output only).
; @param A1     Pointer to the first full sized media source register operand, where we
;               will only use the upper half as input - but we'll be loading it in full.
; @param A2     Pointer to the second full sized media source register operand, where we
;               will only use the upper half as input - but we'll be loading it in full.
;
; @note Simply forwards to IEMIMPL_MEDIA_F1L1L1 - the generated code is identical,
;       only the instruction's own semantics differ.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4014
4015
4016;
4017; Shufflers with evil 8-bit immediates.
4018;
4019
;;
; pshufw with a runtime immediate: dispatches into a jump table of 256
; pre-generated 'pshufw mm0, mm1, imm8 / ret' stubs, one per immediate value.
;
; @param A0     Pointer to the destination (output).
; @param A1     Pointer to the source (input).
; @param A2     The 8-bit immediate (only the low byte is used).
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (ABI does not guarantee they are zero)
        movq    mm1, [A1]
        movq    mm0, mm0                ; paranoia!
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(pshufw+ret) == 9: endbr64 makes each stub 4 bytes larger
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; assert the assumed stub size
ENDPROC iemAImpl_pshufw_u64
4050
4051
;;
; SSE pshufhw/pshuflw/pshufd with a runtime immediate: dispatches into a jump
; table of 256 pre-generated 'op xmm0, xmm1, imm8 / ret' stubs.
;
; @param 1      The instruction.
;
; @param A0     Pointer to the destination (output).
; @param A1     Pointer to the source (input).
; @param A2     The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (ABI does not guarantee they are zero)
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6:  A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; the *2 completes the stub-size multiplication
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the assumed stub size
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4089
4090
;;
; AVX vpshufhw/vpshuflw/vpshufd (256-bit) with a runtime immediate: dispatches
; into a jump table of 256 pre-generated 'op ymm0, ymm1, imm8 / ret' stubs.
;
; @param 1      The instruction.
;
; @param A0     Pointer to the destination (output).
; @param A1     Pointer to the source (input).
; @param A2     The 8-bit immediate (only the low byte is used).
;
; @note Uses the SSE prologue/epilogue macros rather than the AVX ones -
;       NOTE(review): presumably intentional since both are currently empty;
;       confirm if the AVX variants ever gain content.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (ABI does not guarantee they are zero)
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6:  A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; the *2 completes the stub-size multiplication
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the assumed stub size
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4127
4128
4129;
4130; Shifts with evil 8-bit immediates.
4131;
4132
;;
; MMX shift-by-immediate: dispatches into a jump table of 256 pre-generated
; 'op mm0, imm8 / ret' stubs.
;
; @param 1      The instruction.
;
; @param A0     Pointer to the operand (input/output).
; @param A1     The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits (ABI does not guarantee they are zero)
        movq    mm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*8]         ; sizeof(psXX+ret) == 9: endbr64 makes each stub 4 bytes larger
 %else
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; assert the assumed stub size
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4173
4174
;;
; SSE shift-by-immediate: dispatches into a jump table of 256 pre-generated
; 'op xmm0, imm8 / ret' stubs.
;
; @param 1      The instruction.
;
; @param A0     Pointer to the operand (input/output).
; @param A1     The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits (ABI does not guarantee they are zero)
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
 %else
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6:  A1 * 6 = (A1 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; the *2 completes the stub-size multiplication
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the assumed stub size
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4217
4218
4219;
4220; Move byte mask.
4221;
4222
;;
; pmovmskb, MMX variant: byte-mask of mm source into a 64-bit destination.
;
; @param A0     Pointer to the 64-bit destination (output).
; @param A1     Pointer to the 64-bit MMX source (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        pmovmskb T0, mm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32-bit on x86; zero the upper dword explicitly
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4236
;;
; pmovmskb, SSE variant: byte-mask of xmm source into a 64-bit destination.
;
; @param A0     Pointer to the 64-bit destination (output).
; @param A1     Pointer to the 128-bit source (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32-bit on x86; zero the upper dword explicitly
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4250
;;
; vpmovmskb, AVX2 variant: byte-mask of ymm source into a 64-bit destination.
;
; @param A0     Pointer to the 64-bit destination (output).
; @param A1     Pointer to the 256-bit source (input).
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32-bit on x86; zero the upper dword explicitly
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4264
4265
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; @param 1      The instruction
;
; @param A0     Pointer to the extended CPU/FPU state (X86XSAVEAREA).
; @param A1     Pointer to the destination media register size operand (output).
; @param A2     Pointer to the first source media register size operand (input).
; @param A3     Pointer to the second source media register size operand (input).
;
; @note The state pointer in A0 is currently unused by the worker bodies.
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4337
4338
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param 1      The instruction
;
; @param A0     Pointer to the destination media register size operand (output).
; @param A1     Pointer to the first source media register size operand (input).
; @param A2     Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
IEMIMPL_MEDIA_OPT_F3 vpsubsb
IEMIMPL_MEDIA_OPT_F3 vpsubsw
IEMIMPL_MEDIA_OPT_F3 vpsubusb
IEMIMPL_MEDIA_OPT_F3 vpsubusw
IEMIMPL_MEDIA_OPT_F3 vpaddusb
IEMIMPL_MEDIA_OPT_F3 vpaddusw
IEMIMPL_MEDIA_OPT_F3 vpaddsb
IEMIMPL_MEDIA_OPT_F3 vpaddsw
4413
4414
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param 1      The instruction
; @param 2      Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param A0     Pointer to the destination media register size operand (output).
; @param A1     Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4457
4458
4459;
4460; The SSE 4.2 crc32
4461;
; @param A0     Pointer to the 32-bit destination.
; @param A1     The source operand, sized according to the suffix.
4464;
;;
; crc32 with an 8-bit source operand; the running CRC is read from and
; written back to *A0.
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value
        crc32   T0_32, A1_8
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4474
;;
; crc32 with a 16-bit source operand; the running CRC is read from and
; written back to *A0.
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value
        crc32   T0_32, A1_16
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4484
;;
; crc32 with a 32-bit source operand; the running CRC is read from and
; written back to *A0.
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value
        crc32   T0_32, A1_32
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4494
%ifdef RT_ARCH_AMD64
;;
; crc32 with a 64-bit source operand (amd64 only); the CRC result is still
; only 32 bits wide, hence the 32-bit load/store of the accumulator.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; zero-extends into the full T0 register
        crc32   T0, A1
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4506
4507
4508;
4509; PTEST (SSE 4.1)
4510;
4511; @param A0 Pointer to the first source operand (aka readonly destination).
4512; @param A1 Pointer to the second source operand.
4513; @param A2 Pointer to the EFLAGS register.
4514;
;;
; PTEST (SSE 4.1): sets ZF/CF from the AND / ANDN of the two operands.
;
; @param A0     Pointer to the first source operand (aka readonly destination).
; @param A1     Pointer to the second source operand.
; @param A2     Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ptest   xmm0, xmm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; store the resulting status flags for the guest

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4527
;;
; VPTEST, 256-bit AVX variant: sets ZF/CF from the AND / ANDN of the two operands.
;
; @param A0     Pointer to the first source operand (aka readonly destination).
; @param A1     Pointer to the second source operand.
; @param A2     Pointer to the EFLAGS register.
;
; @note Uses the SSE prologue/epilogue macros (both currently empty) although
;       this is an AVX instruction - NOTE(review): confirm if the AVX variants
;       ever gain content.
;
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; store the resulting status flags for the guest

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4540
4541
;;
; Template for the [v]pmov{s,z}x* instructions
;
; @param 1      The instruction
;
; @param A0     Pointer to the destination media register size operand (output).
; @param A1     The source operand value (input; low 32 bits used by movd, the
;               u256 variant instead loads 128 bits from the address in A1).
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]              ; 256-bit variant takes a pointer to a 128-bit source
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - copy&paste typo (harmless while both expand to nothing)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4601
4602
;;
; Result packet for SSE workers: 128-bit value plus the MXCSR produced.
; Need to move this as well somewhere better?
;
struc IEMSSERESULT
    .uResult    resd 4                  ; 128-bit result value
    .MXCSR      resd 1                  ; MXCSR after the operation
endstruc
4610
4611
;;
; Result packet for AVX 128-bit workers: 128-bit value plus the MXCSR produced.
; Need to move this as well somewhere better?
;
struc IEMAVX128RESULT
    .uResult    resd 4                  ; 128-bit result value
    .MXCSR      resd 1                  ; MXCSR after the operation
endstruc
4619
4620
;;
; Result packet for AVX 256-bit workers: 256-bit value plus the MXCSR produced.
; Need to move this as well somewhere better?
;
struc IEMAVX256RESULT
    .uResult    resd 8                  ; 256-bit result value
    .MXCSR      resd 1                  ; MXCSR after the operation
endstruc
4628
4629
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode, FZ and DAZ; all exceptions are masked so the
; host never takes a #XF on the guest's behalf.
;
; @uses 4 bytes of stack to save the original value, T0.
; @param 1      Expression giving the address of the FXSTATE of the guest.
;
; @note Leaves the host MXCSR on the stack; the matching ST macro pops it.
;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR (restored by SSE_ST_FXSTATE_MXCSR)
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; keep only control bits we honour
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions on the host
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4649
4650
;;
; Restores the SSE MXCSR register with the original value and returns the
; guest MXCSR with the new exception flags merged in.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1      Expression giving the address where to return the MXCSR value.
; @param 2      Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer (pops the host MXCSR saved by SSE_LD_FXSTATE_MXCSR).
;
%macro SSE_ST_FXSTATE_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; fetch MXCSR as left by the instruction
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; only the exception flags are taken from the host run
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ldmxcsr [xSP]                   ; restore the saved host MXCSR
        add     xSP, 4
%endmacro
4674
4675
4676;;
4677; Initialize the SSE MXCSR register using the guest value partially to
4678; account for rounding mode.
4679;
4680; @uses 4 bytes of stack to save the original value.
4681; @param 1 Expression giving the address of the FXSTATE of the guest.
4682;
4683%macro AVX_LD_XSAVEAREA_MXCSR 1
4684 sub xSP, 4
4685
4686 stmxcsr [xSP]
4687 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4688 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
4689 sub xSP, 4
4690 mov [xSP], T0_32
4691 ldmxcsr [xSP]
4692 add xSP, 4
4693%endmacro
4694
4695
4696;;
4697; Restores the AVX128 MXCSR register with the original value.
4698;
4699; @param 1 Expression giving the address where to return the MXCSR value.
4700;
4701; @note Restores the stack pointer.
4702;
4703%macro AVX128_ST_XSAVEAREA_MXCSR 1
4704 stmxcsr [%1 + IEMAVX128RESULT.MXCSR]
4705
4706 ldmxcsr [xSP]
4707 add xSP, 4
4708%endmacro
4709
4710
4711;;
4712; Restores the AVX256 MXCSR register with the original value.
4713;
4714; @param 1 Expression giving the address where to return the MXCSR value.
4715;
4716; @note Restores the stack pointer.
4717;
4718%macro AVX256_ST_XSAVEAREA_MXCSR 1
4719 stmxcsr [%1 + IEMAVX256RESULT.MXCSR]
4720
4721 ldmxcsr [xSP]
4722 add xSP, 4
4723%endmacro
4724
4725
4726;;
4727; Floating point instruction working on two full sized registers.
4728;
4729; @param 1 The instruction
4730; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4731;
4732; @param A0 FPU context (FXSTATE or XSAVEAREA).
4733; @param A1 Where to return the result including the MXCSR value.
4734; @param A2 Pointer to the first media register size operand (input/output).
4735; @param A3 Pointer to the second media register size operand (input).
4736;
4737%macro IEMIMPL_FP_F2 2
4738BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4739 PROLOGUE_4_ARGS
4740 IEMIMPL_SSE_PROLOGUE
4741 SSE_LD_FXSTATE_MXCSR A0
4742
4743 movdqu xmm0, [A2]
4744 movdqu xmm1, [A3]
4745 %1 xmm0, xmm1
4746 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4747
4748 SSE_ST_FXSTATE_MXCSR A1, A0
4749 IEMIMPL_SSE_PROLOGUE
4750 EPILOGUE_4_ARGS
4751ENDPROC iemAImpl_ %+ %1 %+ _u128
4752
4753 %if %2 == 3
4754BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4755 PROLOGUE_4_ARGS
4756 IEMIMPL_AVX_PROLOGUE
4757 AVX_LD_XSAVEAREA_MXCSR A0
4758
4759 vmovdqu xmm0, [A2]
4760 vmovdqu xmm1, [A3]
4761 v %+ %1 xmm0, xmm0, xmm1
4762 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4763
4764 AVX128_ST_XSAVEAREA_MXCSR A1
4765 IEMIMPL_AVX_PROLOGUE
4766 EPILOGUE_4_ARGS
4767ENDPROC iemAImpl_v %+ %1 %+ _u128
4768
4769BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4770 PROLOGUE_4_ARGS
4771 IEMIMPL_AVX_PROLOGUE
4772 AVX_LD_XSAVEAREA_MXCSR A0
4773
4774 vmovdqu ymm0, [A2]
4775 vmovdqu ymm1, [A3]
4776 v %+ %1 ymm0, ymm0, ymm1
4777 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4778
4779 AVX256_ST_XSAVEAREA_MXCSR A1
4780 IEMIMPL_AVX_PROLOGUE
4781 EPILOGUE_4_ARGS
4782ENDPROC iemAImpl_v %+ %1 %+ _u256
4783 %elif %2 == 2
4784BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4785 PROLOGUE_4_ARGS
4786 IEMIMPL_AVX_PROLOGUE
4787 AVX_LD_XSAVEAREA_MXCSR A0
4788
4789 vmovdqu xmm0, [A2]
4790 vmovdqu xmm1, [A3]
4791 v %+ %1 xmm0, xmm1
4792 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4793
4794 AVX128_ST_XSAVEAREA_MXCSR A1
4795 IEMIMPL_AVX_PROLOGUE
4796 EPILOGUE_4_ARGS
4797ENDPROC iemAImpl_v %+ %1 %+ _u128
4798
4799BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4800 PROLOGUE_4_ARGS
4801 IEMIMPL_AVX_PROLOGUE
4802 AVX_LD_XSAVEAREA_MXCSR A0
4803
4804 vmovdqu ymm0, [A2]
4805 vmovdqu ymm1, [A3]
4806 v %+ %1 ymm0, ymm1
4807 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4808
4809 AVX256_ST_XSAVEAREA_MXCSR A1
4810 IEMIMPL_AVX_PROLOGUE
4811 EPILOGUE_4_ARGS
4812ENDPROC iemAImpl_v %+ %1 %+ _u256
4813 %endif
4814%endmacro
4815
; Binary packed FP operations; all have 3-operand AVX variants.
IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3
4834
4835
4836;;
4837; These are actually unary operations but to keep it simple
4838; we treat them as binary for now, so the output result is
4839; always in sync with the register where the result might get written
4840; to.
4841IEMIMPL_FP_F2 sqrtps, 2
4842IEMIMPL_FP_F2 rsqrtps, 2
4843IEMIMPL_FP_F2 sqrtpd, 2
4844IEMIMPL_FP_F2 cvtdq2ps, 2
4845IEMIMPL_FP_F2 cvtps2dq, 2
4846IEMIMPL_FP_F2 cvttps2dq, 2
4847IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4848IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
4849IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4850
4851
4852;;
4853; Floating point instruction working on a full sized register and a single precision operand.
4854;
4855; @param 1 The instruction
4856;
4857; @param A0 FPU context (FXSTATE or XSAVEAREA).
4858; @param A1 Where to return the result including the MXCSR value.
4859; @param A2 Pointer to the first media register size operand (input/output).
4860; @param A3 Pointer to the second single precision floating point value (input).
4861;
4862%macro IEMIMPL_FP_F2_R32 1
4863BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
4864 PROLOGUE_4_ARGS
4865 IEMIMPL_SSE_PROLOGUE
4866 SSE_LD_FXSTATE_MXCSR A0
4867
4868 movdqu xmm0, [A2]
4869 movd xmm1, [A3]
4870 %1 xmm0, xmm1
4871 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4872
4873 SSE_ST_FXSTATE_MXCSR A1, A0
4874 IEMIMPL_SSE_EPILOGUE
4875 EPILOGUE_4_ARGS
4876ENDPROC iemAImpl_ %+ %1 %+ _u128_r32
4877
4878BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
4879 PROLOGUE_4_ARGS
4880 IEMIMPL_AVX_PROLOGUE
4881 AVX_LD_XSAVEAREA_MXCSR A0
4882
4883 vmovdqu xmm0, [A2]
4884 vmovd xmm1, [A3]
4885 v %+ %1 xmm0, xmm0, xmm1
4886 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4887
4888 AVX128_ST_XSAVEAREA_MXCSR A1
4889 IEMIMPL_AVX_PROLOGUE
4890 EPILOGUE_4_ARGS
4891ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
4892%endmacro
4893
; Scalar single-precision operations.
IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
4903
4904
4905;;
4906; Floating point instruction working on a full sized register and a double precision operand.
4907;
4908; @param 1 The instruction
4909;
4910; @param A0 FPU context (FXSTATE or XSAVEAREA).
4911; @param A1 Where to return the result including the MXCSR value.
4912; @param A2 Pointer to the first media register size operand (input/output).
4913; @param A3 Pointer to the second double precision floating point value (input).
4914;
4915%macro IEMIMPL_FP_F2_R64 1
4916BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
4917 PROLOGUE_4_ARGS
4918 IEMIMPL_SSE_PROLOGUE
4919 SSE_LD_FXSTATE_MXCSR A0
4920
4921 movdqu xmm0, [A2]
4922 movq xmm1, [A3]
4923 %1 xmm0, xmm1
4924 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4925
4926 SSE_ST_FXSTATE_MXCSR A1, A0
4927 IEMIMPL_SSE_EPILOGUE
4928 EPILOGUE_4_ARGS
4929ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
4930
4931BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
4932 PROLOGUE_4_ARGS
4933 IEMIMPL_AVX_PROLOGUE
4934 AVX_LD_XSAVEAREA_MXCSR A0
4935
4936 vmovdqu xmm0, [A2]
4937 vmovq xmm1, [A3]
4938 v %+ %1 xmm0, xmm0, xmm1
4939 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4940
4941 AVX128_ST_XSAVEAREA_MXCSR A1
4942 IEMIMPL_AVX_EPILOGUE
4943 EPILOGUE_4_ARGS
4944ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
4945%endmacro
4946
; Scalar double-precision operations.
IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4955
4956
4957;;
4958; Macro for the cvtpd2ps/cvtps2pd instructions.
4959;
4960; 1 The instruction name.
4961; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
4962;
4963; @param A0 FPU context (FXSTATE or XSAVEAREA).
4964; @param A1 Where to return the result including the MXCSR value.
4965; @param A2 Pointer to the first media register size operand (input/output).
4966; @param A3 Pointer to the second media register size operand (input).
4967;
4968%macro IEMIMPL_CVT_F2 2
4969BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4970 PROLOGUE_4_ARGS
4971 IEMIMPL_SSE_PROLOGUE
4972 SSE_LD_FXSTATE_MXCSR A0
4973
4974 movdqu xmm0, [A2]
4975 movdqu xmm1, [A3]
4976 %1 xmm0, xmm1
4977 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4978
4979 SSE_ST_FXSTATE_MXCSR A1, A0
4980 IEMIMPL_SSE_EPILOGUE
4981 EPILOGUE_4_ARGS
4982ENDPROC iemAImpl_ %+ %1 %+ _u128
4983
4984BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
4985 PROLOGUE_4_ARGS
4986 IEMIMPL_AVX_PROLOGUE
4987 AVX_LD_XSAVEAREA_MXCSR A0
4988
4989 vmovdqu xmm0, [A2]
4990 vmovdqu xmm1, [A3]
4991 v %+ %1 xmm0, xmm1
4992 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4993
4994 AVX128_ST_XSAVEAREA_MXCSR A1
4995 IEMIMPL_AVX_EPILOGUE
4996 EPILOGUE_4_ARGS
4997ENDPROC iemAImpl_v %+ %1 %+ _u128
4998
4999BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
5000 PROLOGUE_4_ARGS
5001 IEMIMPL_AVX_PROLOGUE
5002 AVX_LD_XSAVEAREA_MXCSR A0
5003
5004 vmovdqu ymm0, [A2]
5005 vmovdqu ymm1, [A3]
5006 %if %2 == 0
5007 v %+ %1 xmm0, ymm1
5008 %else
5009 v %+ %1 ymm0, xmm1
5010 %endif
5011 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
5012
5013 AVX256_ST_XSAVEAREA_MXCSR A1
5014 IEMIMPL_AVX_EPILOGUE
5015 EPILOGUE_4_ARGS
5016ENDPROC iemAImpl_v %+ %1 %+ _u256
5017%endmacro
5018
IEMIMPL_CVT_F2 cvtpd2ps, 0 ; Narrowing conversion (256-bit AVX form yields 128-bit result).
IEMIMPL_CVT_F2 cvtps2pd, 1 ; Widening conversion (256-bit AVX form yields 256-bit result).
5021
5022
5023;;
5024; shufps instructions with 8-bit immediates.
5025;
5026; @param A0 Pointer to the destination media register size operand (input/output).
5027; @param A1 Pointer to the first source media register size operand (input).
5028; @param A2 The 8-bit immediate
5029;
5030BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
5031 PROLOGUE_3_ARGS
5032 IEMIMPL_SSE_PROLOGUE
5033
5034 movzx A2, A2_8 ; must clear top bits
5035 movdqu xmm0, [A0]
5036 movdqu xmm1, [A1]
5037 lea T1, [.imm0 xWrtRIP]
5038 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5039 lea T0, [A2 + A2*4] ; sizeof(shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
5040 %else
5041 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
5042 %endif
5043 lea T1, [T1 + T0*2]
5044 IBT_NOTRACK
5045 call T1
5046 movdqu [A0], xmm0
5047
5048 IEMIMPL_SSE_EPILOGUE
5049 EPILOGUE_3_ARGS
5050 %assign bImm 0
5051 %rep 256
5052.imm %+ bImm:
5053 IBT_ENDBRxx_WITHOUT_NOTRACK
5054 shufps xmm0, xmm1, bImm
5055 ret
5056 int3
5057 %assign bImm bImm + 1
5058 %endrep
5059.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5060ENDPROC iemAImpl_shufps_u128
5061
5062
5063;;
5064; shufpd instruction with 8-bit immediates.
5065;
5066; @param A0 Pointer to the destination media register size operand (input/output).
5067; @param A1 Pointer to the first source media register size operand (input).
5068; @param A2 The 8-bit immediate
5069;
5070BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
5071 PROLOGUE_3_ARGS
5072 IEMIMPL_SSE_PROLOGUE
5073
5074 movzx A2, A2_8 ; must clear top bits
5075 movdqu xmm0, [A0]
5076 movdqu xmm1, [A1]
5077 lea T1, [.imm0 xWrtRIP]
5078 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5079 lea T0, [A2 + A2*4] ; sizeof(shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
5080 %else
5081 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
5082 %endif
5083 lea T1, [T1 + T0*2]
5084 IBT_NOTRACK
5085 call T1
5086 movdqu [A0], xmm0
5087
5088 IEMIMPL_SSE_EPILOGUE
5089 EPILOGUE_3_ARGS
5090 %assign bImm 0
5091 %rep 256
5092.imm %+ bImm:
5093 IBT_ENDBRxx_WITHOUT_NOTRACK
5094 shufpd xmm0, xmm1, bImm
5095 ret
5096 %assign bImm bImm + 1
5097 %endrep
5098.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5099ENDPROC iemAImpl_shufpd_u128
5100
5101
5102;;
5103; vshufp{s,d} instructions with 8-bit immediates.
5104;
5105; @param 1 The instruction name.
5106;
5107; @param A0 Pointer to the destination media register size operand (output).
5108; @param A1 Pointer to the first source media register size operand (input).
5109; @param A2 Pointer to the second source media register size operand (input).
5110; @param A3 The 8-bit immediate
5111;
5112%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
5113BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5114 PROLOGUE_4_ARGS
5115 IEMIMPL_AVX_PROLOGUE
5116
5117 movzx A3, A3_8 ; must clear top bits
5118 movdqu xmm0, [A1]
5119 movdqu xmm1, [A2]
5120 lea T1, [.imm0 xWrtRIP]
5121 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5122 lea T0, [A3 + A3*4] ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5123 %else
5124 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5125 %endif
5126 lea T1, [T1 + T0*2]
5127 IBT_NOTRACK
5128 call T1
5129 movdqu [A0], xmm0
5130
5131 IEMIMPL_AVX_EPILOGUE
5132 EPILOGUE_4_ARGS
5133 %assign bImm 0
5134 %rep 256
5135.imm %+ bImm:
5136 IBT_ENDBRxx_WITHOUT_NOTRACK
5137 %1 xmm0, xmm0, xmm1, bImm
5138 ret
5139 %assign bImm bImm + 1
5140 %endrep
5141.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5142ENDPROC iemAImpl_ %+ %1 %+ _u128
5143
5144BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5145 PROLOGUE_4_ARGS
5146 IEMIMPL_AVX_PROLOGUE
5147
5148 movzx A3, A3_8 ; must clear top bits
5149 vmovdqu ymm0, [A1]
5150 vmovdqu ymm1, [A2]
5151 lea T1, [.imm0 xWrtRIP]
5152 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5153 lea T0, [A3 + A3*4] ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5154 %else
5155 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5156 %endif
5157 lea T1, [T1 + T0*2]
5158 IBT_NOTRACK
5159 call T1
5160 vmovdqu [A0], ymm0
5161
5162 IEMIMPL_AVX_EPILOGUE
5163 EPILOGUE_4_ARGS
5164 %assign bImm 0
5165 %rep 256
5166.imm %+ bImm:
5167 IBT_ENDBRxx_WITHOUT_NOTRACK
5168 %1 ymm0, ymm0, ymm1, bImm
5169 ret
5170 %assign bImm bImm + 1
5171 %endrep
5172.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5173ENDPROC iemAImpl_ %+ %1 %+ _u256
5174%endmacro
5175
IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5178
5179
5180;;
5181; One of the [p]blendv{b,ps,pd} variants
5182;
5183; @param 1 The instruction
5184;
5185; @param A0 Pointer to the first media register sized operand (input/output).
5186; @param A1 Pointer to the second media sized value (input).
5187; @param A2 Pointer to the media register sized mask value (input).
5188;
5189%macro IEMIMPL_P_BLEND 1
5190BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5191 PROLOGUE_3_ARGS
5192 IEMIMPL_SSE_PROLOGUE
5193
5194 movdqu xmm0, [A2] ; This is implicit
5195 movdqu xmm1, [A0]
5196 movdqu xmm2, [A1] ; @todo Do I need to save the original value here first?
5197 %1 xmm1, xmm2
5198 movdqu [A0], xmm1
5199
5200 IEMIMPL_SSE_PROLOGUE
5201 EPILOGUE_3_ARGS
5202ENDPROC iemAImpl_ %+ %1 %+ _u128
5203%endmacro
5204
IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5208
5209
5210;;
5211; One of the v[p]blendv{b,ps,pd} variants
5212;
5213; @param 1 The instruction
5214;
5215; @param A0 Pointer to the first media register sized operand (output).
5216; @param A1 Pointer to the first media register sized operand (input).
5217; @param A2 Pointer to the second media register sized operand (input).
5218; @param A3 Pointer to the media register sized mask value (input).
5219%macro IEMIMPL_AVX_P_BLEND 1
5220BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5221 PROLOGUE_4_ARGS
5222 IEMIMPL_AVX_PROLOGUE
5223
5224 vmovdqu xmm0, [A1]
5225 vmovdqu xmm1, [A2]
5226 vmovdqu xmm2, [A3]
5227 %1 xmm0, xmm0, xmm1, xmm2
5228 vmovdqu [A0], xmm0
5229
5230 IEMIMPL_AVX_PROLOGUE
5231 EPILOGUE_4_ARGS
5232ENDPROC iemAImpl_ %+ %1 %+ _u128
5233
5234BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5235 PROLOGUE_4_ARGS
5236 IEMIMPL_AVX_PROLOGUE
5237
5238 vmovdqu ymm0, [A1]
5239 vmovdqu ymm1, [A2]
5240 vmovdqu ymm2, [A3]
5241 %1 ymm0, ymm0, ymm1, ymm2
5242 vmovdqu [A0], ymm0
5243
5244 IEMIMPL_AVX_PROLOGUE
5245 EPILOGUE_4_ARGS
5246ENDPROC iemAImpl_ %+ %1 %+ _u256
5247%endmacro
5248
IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5252
5253
5254;;
5255; palignr mm1, mm2/m64 instruction.
5256;
5257; @param A0 Pointer to the first media register sized operand (output).
5258; @param A1 The second register sized operand (input).
5259; @param A2 The 8-bit immediate.
5260BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
5261 PROLOGUE_3_ARGS
5262 IEMIMPL_MMX_PROLOGUE
5263
5264 movzx A2, A2_8 ; must clear top bits
5265 movq mm0, [A0]
5266 movq mm1, A1
5267 lea T1, [.imm0 xWrtRIP]
5268 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5269 lea T0, [A2 + A2*4] ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
5270 %else
5271 lea T0, [A2 + A2*2] ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
5272 %endif
5273 lea T1, [T1 + T0*2]
5274 IBT_NOTRACK
5275 call T1
5276 movq [A0], mm0
5277
5278 IEMIMPL_MMX_EPILOGUE
5279 EPILOGUE_3_ARGS
5280 %assign bImm 0
5281 %rep 256
5282.imm %+ bImm:
5283 IBT_ENDBRxx_WITHOUT_NOTRACK
5284 palignr mm0, mm1, bImm
5285 ret
5286 %assign bImm bImm + 1
5287 %endrep
5288.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5289ENDPROC iemAImpl_palignr_u64
5290
5291
5292;;
5293; SSE instructions with 8-bit immediates of the form
5294; xxx xmm1, xmm2, imm8.
5295; where the instruction encoding takes up 6 bytes.
5296;
5297; @param 1 The instruction name.
5298;
5299; @param A0 Pointer to the first media register size operand (input/output).
5300; @param A1 Pointer to the second source media register size operand (input).
5301; @param A2 The 8-bit immediate
5302;
5303%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
5304BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5305 PROLOGUE_3_ARGS
5306 IEMIMPL_SSE_PROLOGUE
5307
5308 movzx A2, A2_8 ; must clear top bits
5309 movdqu xmm0, [A0]
5310 movdqu xmm1, [A1]
5311 lea T1, [.imm0 xWrtRIP]
5312 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5313 lea T0, [A2 + A2*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
5314 lea T1, [T1 + T0*4]
5315 %else
5316 lea T1, [T1 + A2*8] ; sizeof(insnX+ret+int3) == 8: A2 * 8
5317 %endif
5318 IBT_NOTRACK
5319 call T1
5320 movdqu [A0], xmm0
5321
5322 IEMIMPL_SSE_EPILOGUE
5323 EPILOGUE_3_ARGS
5324 %assign bImm 0
5325 %rep 256
5326.imm %+ bImm:
5327 IBT_ENDBRxx_WITHOUT_NOTRACK
5328 %1 xmm0, xmm1, bImm
5329 ret
5330 int3
5331 %assign bImm bImm + 1
5332 %endrep
5333.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5334ENDPROC iemAImpl_ %+ %1 %+ _u128
5335%endmacro
5336
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5344
5345
5346;;
5347; AVX instructions with 8-bit immediates of the form
5348; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5349; where the instruction encoding takes up 6 bytes.
5350;
5351; @param 1 The instruction name.
5352; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5353; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5354;
5355; @param A0 Pointer to the destination media register size operand (output).
5356; @param A1 Pointer to the first source media register size operand (input).
5357; @param A2 Pointer to the second source media register size operand (input).
5358; @param A3 The 8-bit immediate
5359;
5360%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
5361 %if %2 == 1
5362BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5363 PROLOGUE_4_ARGS
5364 IEMIMPL_AVX_PROLOGUE
5365
5366 movzx A3, A3_8 ; must clear top bits
5367 movdqu xmm0, [A1]
5368 movdqu xmm1, [A2]
5369 lea T1, [.imm0 xWrtRIP]
5370 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5371 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5372 lea T1, [T1 + T0*4]
5373 %else
5374 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5375 %endif
5376 IBT_NOTRACK
5377 call T1
5378 movdqu [A0], xmm0
5379
5380 IEMIMPL_AVX_EPILOGUE
5381 EPILOGUE_4_ARGS
5382 %assign bImm 0
5383 %rep 256
5384.imm %+ bImm:
5385 IBT_ENDBRxx_WITHOUT_NOTRACK
5386 %1 xmm0, xmm0, xmm1, bImm
5387 ret
5388 int3
5389 %assign bImm bImm + 1
5390 %endrep
5391.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5392ENDPROC iemAImpl_ %+ %1 %+ _u128
5393 %endif
5394
5395 %if %3 == 1
5396BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5397 PROLOGUE_4_ARGS
5398 IEMIMPL_AVX_PROLOGUE
5399
5400 movzx A3, A3_8 ; must clear top bits
5401 vmovdqu ymm0, [A1]
5402 vmovdqu ymm1, [A2]
5403 lea T1, [.imm0 xWrtRIP]
5404 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5405 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5406 lea T1, [T1 + T0*4]
5407 %else
5408 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5409 %endif
5410 IBT_NOTRACK
5411 call T1
5412 vmovdqu [A0], ymm0
5413
5414 IEMIMPL_AVX_EPILOGUE
5415 EPILOGUE_4_ARGS
5416 %assign bImm 0
5417 %rep 256
5418.imm %+ bImm:
5419 IBT_ENDBRxx_WITHOUT_NOTRACK
5420 %1 ymm0, ymm0, ymm1, bImm
5421 ret
5422 int3
5423 %assign bImm bImm + 1
5424 %endrep
5425.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5426ENDPROC iemAImpl_ %+ %1 %+ _u256
5427 %endif
5428%endmacro
5429
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0 ; 128-bit only.
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1 ; 256-bit only.
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1 ; 256-bit only.
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
5438
5439
5440;;
5441; Need to move this as well somewhere better?
5442;
5443struc IEMPCMPISTRXSRC
5444 .uSrc1 resd 4
5445 .uSrc2 resd 4
5446endstruc
5447
5448struc IEMPCMPESTRXSRC
5449 .uSrc1 resd 4
5450 .uSrc2 resd 4
5451 .u64Rax resd 2
5452 .u64Rdx resd 2
5453endstruc
5454
5455;;
5456; The pcmpistri instruction.
5457;
5458; @param A0 Pointer to the ECX register to store the result to (output).
5459; @param A1 Pointer to the EFLAGS register.
5460; @param A2 Pointer to the structure containing the source operands (input).
5461; @param A3 The 8-bit immediate
5462;
5463BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
5464 PROLOGUE_4_ARGS
5465 IEMIMPL_SSE_PROLOGUE
5466
5467 movzx A3, A3_8 ; must clear top bits
5468 movdqu xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
5469 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
5470 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5471 lea T1, [.imm0 xWrtRIP]
5472 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5473 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5474 lea T1, [T1 + T0*4]
5475 %else
5476 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5477 %endif
5478 IBT_NOTRACK
5479 call T1
5480
5481 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5482 mov [T2], ecx
5483
5484 IEMIMPL_SSE_EPILOGUE
5485 EPILOGUE_4_ARGS
5486 %assign bImm 0
5487 %rep 256
5488.imm %+ bImm:
5489 IBT_ENDBRxx_WITHOUT_NOTRACK
5490 pcmpistri xmm0, xmm1, bImm
5491 ret
5492 int3
5493 %assign bImm bImm + 1
5494 %endrep
5495.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5496ENDPROC iemAImpl_pcmpistri_u128
5497
5498;;
5499; The pcmpestri instruction.
5500;
5501; @param A0 Pointer to the ECX register to store the result to (output).
5502; @param A1 Pointer to the EFLAGS register.
5503; @param A2 Pointer to the structure containing the source operands (input).
5504; @param A3 The 8-bit immediate
5505;
5506BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
5507 PROLOGUE_4_ARGS
5508 IEMIMPL_SSE_PROLOGUE
5509
5510 movzx A3, A3_8 ; must clear top bits
5511 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
5512 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
5513 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5514 lea T1, [.imm0 xWrtRIP]
5515 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5516 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5517 lea T1, [T1 + T0*4]
5518 %else
5519 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5520 %endif
5521 push xDX ; xDX can be A1 or A2 depending on the calling convention
5522 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5523 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5524 IBT_NOTRACK
5525 call T1
5526
5527 pop xDX
5528 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5529 mov [T2], ecx
5530
5531 IEMIMPL_SSE_EPILOGUE
5532 EPILOGUE_4_ARGS
5533 %assign bImm 0
5534 %rep 256
5535.imm %+ bImm:
5536 IBT_ENDBRxx_WITHOUT_NOTRACK
5537 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5538 pcmpestri xmm0, xmm1, bImm
5539 ret
5540 %assign bImm bImm + 1
5541 %endrep
5542.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5543ENDPROC iemAImpl_pcmpestri_u128
5544
5545;;
5546; The pcmpistrm instruction template.
5547;
5548; @param A0 Pointer to the XMM0 register to store the result to (output).
5549; @param A1 Pointer to the EFLAGS register.
5550; @param A2 Pointer to the structure containing the source operands (input).
5551; @param A3 The 8-bit immediate
5552;
5553BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
5554 PROLOGUE_4_ARGS
5555 IEMIMPL_SSE_PROLOGUE
5556
5557 movzx A3, A3_8 ; must clear top bits
5558 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
5559 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
5560 lea T1, [.imm0 xWrtRIP]
5561 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5562 lea T0, [A3 + A3*2] ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
5563 lea T1, [T1 + T0*4]
5564 %else
5565 lea T0, [T1 + A3*8] ; sizeof(pcmpistrm+ret) == 8: A3 * 8
5566 %endif
5567 IBT_NOTRACK
5568 call T1
5569
5570 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5571 movdqu [A0], xmm0
5572
5573 IEMIMPL_SSE_EPILOGUE
5574 EPILOGUE_4_ARGS
5575 %assign bImm 0
5576 %rep 256
5577.imm %+ bImm:
5578 IBT_ENDBRxx_WITHOUT_NOTRACK
5579 pcmpistrm xmm1, xmm2, bImm
5580 ret
5581 int3
5582 %assign bImm bImm + 1
5583 %endrep
5584.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5585ENDPROC iemAImpl_pcmpistrm_u128
5586
5587;;
5588; The pcmpestrm instruction template.
5589;
5590; @param A0 Pointer to the XMM0 register to store the result to (output).
5591; @param A1 Pointer to the EFLAGS register.
5592; @param A2 Pointer to the structure containing the source operands (input).
5593; @param A3 The 8-bit immediate
5594;
5595BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
5596 PROLOGUE_4_ARGS
5597 IEMIMPL_SSE_PROLOGUE
5598
5599 movzx A3, A3_8 ; must clear top bits
5600 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
5601 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
5602 lea T1, [.imm0 xWrtRIP]
5603 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5604 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5605 lea T1, [T1 + T0*4]
5606 %else
5607 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5608 %endif
5609 push xDX ; xDX can be A1 or A2 depending on the calling convention
5610 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5611 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5612 IBT_NOTRACK
5613 call T1
5614
5615 pop xDX
5616 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5617 movdqu [A0], xmm0
5618
5619 IEMIMPL_SSE_EPILOGUE
5620 EPILOGUE_4_ARGS
5621 %assign bImm 0
5622 %rep 256
5623.imm %+ bImm:
5624 IBT_ENDBRxx_WITHOUT_NOTRACK
5625 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5626 pcmpestrm xmm1, xmm2, bImm
5627 ret
5628 %assign bImm bImm + 1
5629 %endrep
5630.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5631ENDPROC iemAImpl_pcmpestrm_u128
5632
5633
5634;;
5635; pinsrw instruction.
5636;
5637; @param A0 Pointer to the first media register size operand (input/output).
5638; @param A1 The 16 bit input operand (input).
5639; @param A2 The 8-bit immediate
5640;
5641BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
5642 PROLOGUE_3_ARGS
5643 IEMIMPL_SSE_PROLOGUE
5644
5645 movzx A2, A2_8 ; must clear top bits
5646 movq mm0, [A0]
5647 lea T1, [.imm0 xWrtRIP]
5648 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5649 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
5650 %else
5651 lea T0, [A2 + A2*4] ; sizeof(pinsrw+ret) == 5: A2 * 5
5652 %endif
5653 lea T1, [T1 + T0]
5654 IBT_NOTRACK
5655 call T1
5656 movq [A0], mm0
5657
5658 IEMIMPL_SSE_EPILOGUE
5659 EPILOGUE_3_ARGS
5660 %assign bImm 0
5661 %rep 256
5662.imm %+ bImm:
5663 IBT_ENDBRxx_WITHOUT_NOTRACK
5664 pinsrw mm0, A1_32, bImm
5665 ret
5666 %assign bImm bImm + 1
5667 %endrep
5668.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5669ENDPROC iemAImpl_pinsrw_u64
5670
;; pinsrw into an XMM register; same stub scheme as the u64 variant but with
;; 6-byte stubs (10 with IBT) due to the 66h prefix.
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + A2 * stub size.
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_pinsrw_u128
5700
5701;;
5702; vpinsrw instruction.
5703;
5704; @param A0 Pointer to the first media register size operand (output).
5705; @param A1 Pointer to the source media register size operand (input).
5706; @param A2 The 16 bit input operand (input).
5707; @param A3 The 8-bit immediate
5708;
5709BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
5710 PROLOGUE_4_ARGS
5711 IEMIMPL_SSE_PROLOGUE
5712
5713 movzx A3, A3_8 ; must clear top bits
5714 movdqu xmm0, [A1]
5715 lea T1, [.imm0 xWrtRIP]
5716 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5717 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
5718 %else
5719 lea T0, [A3 + A3*2] ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
5720 %endif
5721 lea T1, [T1 + T0*2]
5722 mov A1, A2 ; A2 requires longer encoding on Windows
5723 IBT_NOTRACK
5724 call T1
5725 movdqu [A0], xmm0
5726
5727 IEMIMPL_SSE_EPILOGUE
5728 EPILOGUE_4_ARGS
5729 %assign bImm 0
5730 %rep 256
5731.imm %+ bImm:
5732 IBT_ENDBRxx_WITHOUT_NOTRACK
5733 vpinsrw xmm0, xmm0, A1_32, bImm
5734 ret
5735 %assign bImm bImm + 1
5736 %endrep
5737.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5738ENDPROC iemAImpl_vpinsrw_u128
5739
5740
5741;;
5742; pextrw instruction.
5743;
5744; @param A0 Pointer to the 16bit output operand (output).
5745; @param A1 Pointer to the media register size operand (input).
5746; @param A2 The 8-bit immediate
5747;
5748BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
5749 PROLOGUE_3_ARGS
5750 IEMIMPL_SSE_PROLOGUE
5751
5752 movzx A2, A2_8 ; must clear top bits
5753 movq mm0, A1
5754 lea T1, [.imm0 xWrtRIP]
5755 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5756 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
5757 %else
5758 lea T0, [A2 + A2*4] ; sizeof(pextrw+ret) == 5: A2 * 5
5759 %endif
5760 lea T1, [T1 + T0]
5761 IBT_NOTRACK
5762 call T1
5763 mov word [A0], T0_16
5764
5765 IEMIMPL_SSE_EPILOGUE
5766 EPILOGUE_3_ARGS
5767 %assign bImm 0
5768 %rep 256
5769.imm %+ bImm:
5770 IBT_ENDBRxx_WITHOUT_NOTRACK
5771 pextrw T0_32, mm0, bImm
5772 ret
5773 %assign bImm bImm + 1
5774 %endrep
5775.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5776ENDPROC iemAImpl_pextrw_u64
5777
;; pextrw instruction, SSE (XMM) variant - see iemAImpl_pextrw_u64 for the
;; parameter description; A1 points to a 128-bit source here.
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (ABI does not guarantee they are zero)
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + A2 * stub-size
        IBT_NOTRACK
        call    T1
        mov     word [A0], T0_16        ; store the word the stub extracted

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:                           ; one stub per possible immediate; all must be equally sized
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the stride assumed above
ENDPROC iemAImpl_pextrw_u128
5807
;;
; vpextrw instruction.
;
; AVX form of pextrw; same 256-entry stub-array technique to encode the
; immediate into the instruction.
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (ABI does not guarantee they are zero)
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + A2 * stub-size
        IBT_NOTRACK
        call    T1
        mov     word [A0], T0_16        ; store the word the stub extracted

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:                           ; one stub per possible immediate; all must be equally sized
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpextrw T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the stride assumed above
ENDPROC iemAImpl_vpextrw_u128
5844
5845
;;
; movmskp{s,d} SSE instruction template
;
; Emits three helpers: the SSE u128 form, the AVX u128 form and the AVX u256
; form.  The sign-bit mask fits in a byte for all of these, hence the byte
; store of T0_8.
;
; @param 1 The SSE instruction name.
; @param 2 The AVX instruction name.
;
; @param A0 Pointer to the output register (output/byte sized).
; @param A1 Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0                ; mask of the sign bits -> T0
        mov     byte [A0], T0_8

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0                ; mask of the sign bits -> T0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0                ; mask of the sign bits -> T0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5895
5896
;;
; Restores the SSE MXCSR register with the original value.
;
; Counterpart to SSE_LD_FXSTATE_MXCSR: merges the exception status flags
; accumulated in the live MXCSR into the guest value and stores the result,
; then restores the host MXCSR that the LD macro left on the stack.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param 2 Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; capture the current (guest-mode) MXCSR
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the exception status flags
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; reload the host MXCSR saved by SSE_LD_FXSTATE_MXCSR ...
        add     xSP, 4                  ; ... and pop it (this is the "restores the stack pointer" part)
%endmacro
5920
5921
;;
; cvttsd2si instruction - 32-bit variant.
;
; Truncating double -> int32 conversion performed with the guest MXCSR
; loaded, so masked exceptions accumulate in the returned MXCSR value.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvttsd2si T0_32, [A3]           ; truncating convert
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64

;;
; cvttsd2si instruction - 64-bit variant.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvttsd2si T0, [A3]              ; truncating convert
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5963
5964
;;
; cvtsd2si instruction - 32-bit variant.
;
; Rounding double -> int32 conversion; honours the rounding control of the
; guest MXCSR loaded from the FXSTATE.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvtsd2si T0_32, [A3]            ; convert using current rounding mode
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64

;;
; cvtsd2si instruction - 64-bit variant.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvtsd2si T0, [A3]               ; convert using current rounding mode
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
6006
6007
;;
; cvttss2si instruction - 32-bit variant.
;
; Truncating single -> int32 conversion performed under the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvttss2si T0_32, [A3]           ; truncating convert
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32

;;
; cvttss2si instruction - 64-bit variant.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvttss2si T0, [A3]              ; truncating convert
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
6049
6050
;;
; cvtss2si instruction - 32-bit variant.
;
; Rounding single -> int32 conversion; honours the rounding control of the
; guest MXCSR loaded from the FXSTATE.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvtss2si T0_32, [A3]            ; convert using current rounding mode
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32

;;
; cvtss2si instruction - 64-bit variant.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvtss2si T0, [A3]               ; convert using current rounding mode
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
6092
6093
;;
; cvtsi2ss instruction - 32-bit variant.
;
; int32 -> single conversion; result is stored as a 32-bit value through A2.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvtsi2ss xmm0, dword [A3]
        movd    dword [A2], xmm0        ; store only the low 32 bits (the scalar result)

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32

;;
; cvtsi2ss instruction - 64-bit variant.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvtsi2ss xmm0, qword [A3]
        movd    dword [A2], xmm0        ; store only the low 32 bits (the scalar result)

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
6135
6136
;;
; cvtsi2sd instruction - 32-bit variant.
;
; int32 -> double conversion; result is stored as a 64-bit value through A2.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvtsi2sd xmm0, dword [A3]
        movq    [A2], xmm0              ; store only the low 64 bits (the scalar result)

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32

;;
; cvtsi2sd instruction - 64-bit variant.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run the conversion under the guest MXCSR

        cvtsi2sd xmm0, qword [A3]
        movq    [A2], xmm0              ; store only the low 64 bits (the scalar result)

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags into *A1; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
6178
6179
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Takes only FZ/RC/DAZ from the guest and forces all exception mask bits set,
; so the host never receives an unmasked SIMD FP exception on the guest's
; behalf.  Leaves the original host MXCSR on the stack for the matching
; SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE to restore and pop.
;
; @uses 4 bytes of stack to save the original value, T0.
; @param 1 Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save host MXCSR (left on stack for the ST macro)
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; keep guest rounding/denormal control only
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the merged value
        add     xSP, 4
%endmacro
6199
6200
;;
; Restores the SSE MXCSR register with the original value.
;
; Counterpart to SSE_LD_FXSTATE_MXCSR_ONLY: merges the exception status flags
; accumulated in the live MXCSR back into the caller-supplied MXCSR variable,
; then restores the host MXCSR the LD macro left on the stack.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        sub     xSP, 4
        stmxcsr [xSP]                   ; capture the current (guest-mode) MXCSR
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%1]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the exception status flags
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; reload the host MXCSR saved by SSE_LD_FXSTATE_MXCSR_ONLY ...
        add     xSP, 4                  ; ... and pop it (this is the "restores the stack pointer" part)
%endmacro
6223
6224
;
; UCOMISS (SSE)
;
; Unordered scalar single-precision compare; the compare result lands in
; EFLAGS, captured via IEM_SAVE_FLAGS.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run compare under guest rounding/DAZ, all exceptions masked

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128

; AVX variant - same contract as iemAImpl_ucomiss_u128.
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE for an AVX instruction while other
; v-helpers in this file use IEMIMPL_AVX_PROLOGUE - confirm intentional.
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run compare under guest rounding/DAZ, all exceptions masked

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
6262
6263
;
; UCOMISD (SSE)
;
; Unordered scalar double-precision compare; the compare result lands in
; EFLAGS, captured via IEM_SAVE_FLAGS.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run compare under guest rounding/DAZ, all exceptions masked

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128

; AVX variant - same contract as iemAImpl_ucomisd_u128.
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE for an AVX instruction - confirm intentional.
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run compare under guest rounding/DAZ, all exceptions masked

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6301
;
; COMISS (SSE)
;
; Ordered scalar single-precision compare; the compare result lands in
; EFLAGS, captured via IEM_SAVE_FLAGS.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run compare under guest rounding/DAZ, all exceptions masked

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comiss  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128

; AVX variant - same contract as iemAImpl_comiss_u128.
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE for an AVX instruction - confirm intentional.
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run compare under guest rounding/DAZ, all exceptions masked

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6339
6340
;
; COMISD (SSE)
;
; Ordered scalar double-precision compare; the compare result lands in
; EFLAGS, captured via IEM_SAVE_FLAGS.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run compare under guest rounding/DAZ, all exceptions masked

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comisd  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128

; AVX variant - same contract as iemAImpl_comisd_u128.
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE for an AVX instruction - confirm intentional.
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run compare under guest rounding/DAZ, all exceptions masked

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6378
6379
;;
; Need to move this as well somewhere better?
;
; Packs two 128-bit source operands into one structure so helpers that are
; limited in argument count (cmpps & friends below) can take both via a
; single pointer.
;
struc IEMMEDIAF2XMMSRC
    .uSrc1 resd 4                       ; first 128-bit source operand
    .uSrc2 resd 4                       ; second 128-bit source operand
endstruc
6387
6388
;
; CMPPS (SSE)
;
; Packed single compare with an 8-bit predicate immediate; the immediate is
; encoded via a 256-entry array of fixed-size stubs.  Runs with the guest
; MXCSR loaded (exceptions masked) and merges the status flags back.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movzx   A3, A3_8                ; must clear top bits (ABI does not guarantee they are zero)
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*8]         ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
 %else
        lea     T0, [A3 + A3*4]         ; sizeof(cmpps+ret) == 5: A3 * 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = .imm0 + A3 * stub-size
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0              ; store the compare mask result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:                           ; one stub per possible immediate; all must be equally sized
        IBT_ENDBRxx_WITHOUT_NOTRACK
        cmpps   xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; assert the stride assumed above
ENDPROC iemAImpl_cmpps_u128
6429
;;
; SSE instructions with 8-bit immediates of the form
; xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; The immediate is encoded via a 256-entry array of 6-byte stubs
; (5-byte instruction + 1-byte ret; doubled with endbr under IBT).
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movzx   A3, A3_8                ; must clear top bits (ABI does not guarantee they are zero)
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + A3 * stub-size
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0              ; store the result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:                           ; one stub per possible immediate; all must be equally sized
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the stride assumed above
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6481
;;
; SSE instructions with 8-bit immediates of the form
; xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; Stubs are padded with int3 to 8 bytes (12 with endbr under IBT) so the
; per-immediate stride stays a nice power-of-two multiple.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movzx   A3, A3_8                ; must clear top bits (ABI does not guarantee they are zero)
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insn+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0              ; store the result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:                           ; one stub per possible immediate; all must be equally sized
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; padding to keep a fixed stub stride; never executed
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; assert the stride assumed above
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6537
6538
;;
; SSE instructions of the form
; xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; 128-bit XMM source converted to a 64-bit MMX result (cvt[t]pd2pi),
; executed under the guest MXCSR with status flags merged back.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first MMX register sized operand (output).
; @param A2 Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        %1      mm0, xmm0
        movq    [A1], mm0               ; store the 64-bit MMX result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6568
;;
; SSE instructions of the form
; xxx xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; 64-bit MMX-style source value converted into a 128-bit XMM destination
; (cvtpi2ps/cvtpi2pd), executed under the guest MXCSR.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register sized operand (input/output).
; @param A2 The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A1]              ; load the destination (its upper part is preserved)
        movq    mm0, A2
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6599
;;
; SSE instructions of the form
; xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; 64-bit source value converted to a 64-bit MMX result (cvt[t]ps2pi),
; executed under the guest MXCSR.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first MMX media register sized operand (output).
; @param A2 The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    xmm0, A2
        %1      mm0, xmm0
        movq    [A1], mm0               ; store the 64-bit MMX result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags into *A0; restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6629
;
; All forms of RDRAND and RDSEED
;
; @param 1 The instruction name (rdrand or rdseed).
; @param 2 The register to execute the instruction on (sized to match %3).
; @param 3 The operand width in bits (16, 32 or 64), used in the symbol name.
;
; @param A0 Pointer to the destination operand.
; @param A1 Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2                      ; rdrand/rdseed into %2; CF signals availability of the value
        mov     [A0], %2
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; propagate the flags (incl. CF) to the caller

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6654
6655
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The 2-bit round-function selector arrives as an 8-bit immediate and is
; encoded via the usual 256-entry fixed-size stub array.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (ABI does not guarantee they are zero)
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + A2 * stub-size
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0              ; write back the updated state

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:                           ; one stub per possible immediate; all must be equally sized
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the stride assumed above
ENDPROC iemAImpl_sha1rnds4_u128
6695
6696
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction takes an implicit third operand in XMM0, which is loaded
; from the constants A2 points at before executing it.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; implicit XMM0 operand
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]
        sha256rnds2 xmm1, xmm2          ; xmm0 is used implicitly by the instruction
        movdqu  [A0], xmm1              ; write back the updated state

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
6719
6720
;
; 32-bit forms of ADCX and ADOX
;
; @param 1 The instruction name (adcx or adox).
; @param 2 The EFLAGS bit the instruction consumes and produces
;          (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
; @param A0 Pointer to the destination operand (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS                 ; NOTE(review): 4-arg prologue with 3 documented args - presumably matches the caller's function-pointer type; confirm

        IEM_LOAD_FLAGS A1, %2, 0        ; load the carry/overflow input flag from *A1
        %1      A2_32, [A0]             ; A2_32 += *A0 + flag; updates only the %2 flag
        mov     [A0], A2_32
        IEM_SAVE_FLAGS A1, %2, 0        ; write the resulting flag back to *A1

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro
6740
;
; 64-bit forms of ADCX and ADOX
;
; @param 1 The instruction name (adcx or adox).
; @param 2 The EFLAGS bit the instruction consumes and produces
;          (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
; @param A0 Pointer to the destination operand (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS                 ; NOTE(review): 4-arg prologue with 3 documented args - presumably matches the caller's function-pointer type; confirm

        IEM_LOAD_FLAGS A1, %2, 0        ; load the carry/overflow input flag from *A1
        %1      A2, [A0]                ; A2 += *A0 + flag; updates only the %2 flag
        mov     [A0], A2
        IEM_SAVE_FLAGS A1, %2, 0        ; write the resulting flag back to *A1

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette