VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 104056

Last change on this file since 104056 was 104051, checked in by vboxsync, 13 months ago

VMM/IEM: Optimizing (hopefully) and correcting flag handling in IEMAImpl.asm. Prep for shl, shr, and friends. bugref:10376

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 203.7 KB
Line 
1; $Id: IEMAllAImpl.asm 104051 2024-03-26 02:10:26Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; RET XX / RET wrapper for fastcall.
43;
44%macro RET_FASTCALL 1
45%ifdef RT_ARCH_X86
46 %ifdef RT_OS_WINDOWS
47 ret %1 ; x86 Windows fastcall: callee pops the %1 bytes of stack arguments.
48 %else
49 ret ; other x86 targets: no callee stack cleanup.
50 %endif
51%else
52 ret ; AMD64: all arguments are in registers, nothing to pop.
53%endif
54%endmacro
55
56;;
57; NAME for fastcall functions.
58;
59;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
60; escaping (or whatever the dollar is good for here). Thus the ugly
61; prefix argument.
62;
63%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name) ; default: plain name, no fastcall decoration.
64%ifdef RT_ARCH_X86
65 %ifdef RT_OS_WINDOWS
66 %undef NAME_FASTCALL
67 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs ; Windows x86 fastcall decoration: <prefix>Name@cbArgs.
68 %endif
69%endif
70
71;;
72; BEGINPROC for fastcall functions.
73;
74; @param 1 The function name (C).
75; @param 2 The argument size on x86.
76;
77%macro BEGINPROC_FASTCALL 2
78GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden ; export under the (possibly decorated) fastcall name.
79 IBT_ENDBRxx ; IBT landing pad macro - expands to endbrxx or nothing per build config.
80%endmacro
81
82
83;
84; We employ some macro assembly here to hide the calling convention differences.
85;
86%ifdef RT_ARCH_AMD64 ; AMD64: up to four arguments arrive in registers, so the prologues/epilogues are trivial.
87 %macro PROLOGUE_1_ARGS 0
88 %endmacro
89 %macro EPILOGUE_1_ARGS 0
90 ret
91 %endmacro
92 %macro EPILOGUE_1_ARGS_EX 0
93 ret
94 %endmacro
95
96 %macro PROLOGUE_2_ARGS 0
97 %endmacro
98 %macro EPILOGUE_2_ARGS 0
99 ret
100 %endmacro
101 %macro EPILOGUE_2_ARGS_EX 1
102 ret
103 %endmacro
104
105 %macro PROLOGUE_3_ARGS 0
106 %endmacro
107 %macro EPILOGUE_3_ARGS 0
108 ret
109 %endmacro
110 %macro EPILOGUE_3_ARGS_EX 1
111 ret
112 %endmacro
113
114 %macro PROLOGUE_4_ARGS 0
115 %endmacro
116 %macro EPILOGUE_4_ARGS 0
117 ret
118 %endmacro
119 %macro EPILOGUE_4_ARGS_EX 1
120 ret
121 %endmacro
122
123 %ifdef ASM_CALL64_GCC ; System V AMD64 ABI: args in rdi, rsi, rdx, rcx.
124 %define A0 rdi
125 %define A0_32 edi
126 %define A0_16 di
127 %define A0_8 dil
128
129 %define A1 rsi
130 %define A1_32 esi
131 %define A1_16 si
132 %define A1_8 sil
133
134 %define A2 rdx
135 %define A2_32 edx
136 %define A2_16 dx
137 %define A2_8 dl
138
139 %define A3 rcx
140 %define A3_32 ecx
141 %define A3_16 cx
142 %define A3_8 cl
143 %endif
144
145 %ifdef ASM_CALL64_MSC ; Microsoft x64 ABI: args in rcx, rdx, r8, r9.
146 %define A0 rcx
147 %define A0_32 ecx
148 %define A0_16 cx
149 %define A0_8 cl
150
151 %define A1 rdx
152 %define A1_32 edx
153 %define A1_16 dx
154 %define A1_8 dl
155
156 %define A2 r8
157 %define A2_32 r8d
158 %define A2_16 r8w
159 %define A2_8 r8b
160
161 %define A3 r9
162 %define A3_32 r9d
163 %define A3_16 r9w
164 %define A3_8 r9b
165 %endif
166
167 %define T0 rax ; T0..T2: scratch registers, volatile in both 64-bit conventions.
168 %define T0_32 eax
169 %define T0_16 ax
170 %define T0_8 al
171
172 %define T1 r11
173 %define T1_32 r11d
174 %define T1_16 r11w
175 %define T1_8 r11b
176
177 %define T2 r10 ; only AMD64
178 %define T2_32 r10d
179 %define T2_16 r10w
180 %define T2_8 r10b
181
182%else
183 ; x86: A0/A1 are passed in ecx/edx (fastcall); A2/A3 live on the stack and are loaded into ebx/esi by the 3/4-arg prologues, which is why those save/restore callee-saved registers.
184 %macro PROLOGUE_1_ARGS 0
185 push edi
186 %endmacro
187 %macro EPILOGUE_1_ARGS 0
188 pop edi
189 ret 0
190 %endmacro
191 %macro EPILOGUE_1_ARGS_EX 1
192 pop edi
193 ret %1
194 %endmacro
195
196 %macro PROLOGUE_2_ARGS 0
197 push edi
198 %endmacro
199 %macro EPILOGUE_2_ARGS 0
200 pop edi
201 ret 0
202 %endmacro
203 %macro EPILOGUE_2_ARGS_EX 1
204 pop edi
205 ret %1
206 %endmacro
207
208 %macro PROLOGUE_3_ARGS 0
209 push ebx
210 mov ebx, [esp + 4 + 4] ; load A2 from the stack (above the return address and saved ebx).
211 push edi
212 %endmacro
213 %macro EPILOGUE_3_ARGS_EX 1
214 %if (%1) < 4
215 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
216 %endif
217 pop edi
218 pop ebx
219 ret %1
220 %endmacro
221 %macro EPILOGUE_3_ARGS 0
222 EPILOGUE_3_ARGS_EX 4
223 %endmacro
224
225 %macro PROLOGUE_4_ARGS 0
226 push ebx
227 push edi
228 push esi
229 mov ebx, [esp + 12 + 4 + 0] ; A2: above the three saved registers and the return address.
230 mov esi, [esp + 12 + 4 + 4] ; A3.
231 %endmacro
232 %macro EPILOGUE_4_ARGS_EX 1
233 %if (%1) < 8
234 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
235 %endif
236 pop esi
237 pop edi
238 pop ebx
239 ret %1
240 %endmacro
241 %macro EPILOGUE_4_ARGS 0
242 EPILOGUE_4_ARGS_EX 8
243 %endmacro
244
245 %define A0 ecx
246 %define A0_32 ecx
247 %define A0_16 cx
248 %define A0_8 cl
249
250 %define A1 edx
251 %define A1_32 edx
252 %define A1_16 dx
253 %define A1_8 dl
254
255 %define A2 ebx
256 %define A2_32 ebx
257 %define A2_16 bx
258 %define A2_8 bl
259
260 %define A3 esi
261 %define A3_32 esi
262 %define A3_16 si ; note: no A3_8 - esi has no 8-bit alias.
263
264 %define T0 eax
265 %define T0_32 eax
266 %define T0_16 ax
267 %define T0_8 al
268
269 %define T1 edi
270 %define T1_32 edi
271 %define T1_16 di ; note: no T1_8 - edi has no 8-bit alias; no T2 on x86 either.
272%endif
273
274
275;;
276; This is handy for generating absolutely correct EFLAGS.
277;%define IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
278
279;;
280; Load the relevant flags from [%1] if there are undefined flags (%3) or flags that must be loaded (%4); otherwise the CPU flags are left alone.
281;
282; @remarks Clobbers T0, stack. Changes EFLAGS. The fast path ASSUMES T0_32 is eax (lahf/sahf tricks).
283; @param 1 The parameter (A0..A3) pointing to the eflags.
284; @param 2 The set of modified flags.
285; @param 3 The set of undefined flags.
286; @param 4 The flags that must be loaded.
287;
288%macro IEM_MAYBE_LOAD_FLAGS 4
289 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
290 pushf ; store current flags
291 mov T0_32, [%1] ; load the guest flags
292 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified and undefined flags
293 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified and undefined flags.
294 or [xSP], T0 ; merge guest flags with host flags.
295 popf ; load the mixed flags.
296
297 %elif (%3 + %4) != 0
298 %if 1 ; This approach seems faster on intel 10980XE
299 %if (%3 | %4) == X86_EFL_CF
300 ; Use bt to load bit into CF
301 bt dword [%1], X86_EFL_CF_BIT
302 %else
303 ; Use ADD to set OF and SAHF for the rest. ASSUMES T0_32 is eax!
304 mov eax, [%1]
305 %if (%3 | %4) == X86_EFL_OF
306 ; Use ADD to set OF.
307 shl eax, 31 - X86_EFL_OF_BIT
308 add eax, 80000000h
309 %elif ((%3 | %4) & X86_EFL_OF) != 0
310 ; Use ADD to set OF.
311 xchg al, ah
312 shl al, 15 - X86_EFL_OF_BIT
313 add al, 80h
314 ; Use SAHF to set the other status flags.
315 sahf
316 %else ; OF not needed; so al -> ah and load ah into eflags.
317 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
318 shl eax, 8
319 %else
320 xchg al, ah
321 %endif
322 sahf
323 %endif
324 %endif
325
326 %else
327 pushf ; store current flags
328 mov T0_32, [%1] ; load the guest flags
329 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
330 and T0_32, (%2 | %3) ; select the modified and undefined flags.
331 or [xSP], T0 ; merge guest flags with host flags.
332 popf ; load the mixed flags.
333 %endif
334 %endif
335%endmacro
336
337;;
338; Load the relevant flags from [%1] unconditionally.
339;
340; @remarks Clobbers T0, stack. Changes EFLAGS. The fast path ASSUMES T0_32 is eax (lahf/sahf tricks).
341; @param 1 The parameter (A0..A3) pointing to the eflags.
342; @param 2 The set of flags to load.
343; @param 3 The set of undefined flags.
344;
345%macro IEM_LOAD_FLAGS 3
346 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
347 pushf ; store current flags
348 mov T0_32, [%1] ; load the guest flags
349 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified, undefined and status flags
350 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
351 or [xSP], T0 ; merge guest flags with host flags.
352 popf ; load the mixed flags.
353
354 %elif 1 ; This approach seems faster on intel 10980XE
355 %if (%3 | %2) == X86_EFL_CF
356 ; Use bt to load bit into CF
357 bt dword [%1], X86_EFL_CF_BIT
358 %else
359 mov eax, [%1] ; ASSUMES T0_32 is eax!!
360 %if (%3 | %2) == X86_EFL_OF
361 ; Use ADD to set OF.
362 shl eax, 31 - X86_EFL_OF_BIT
363 add eax, 80000000h
364 %elif ((%3 | %2) & X86_EFL_OF) != 0
365 ; Use ADD to set OF.
366 xchg al, ah
367 shl al, 15 - X86_EFL_OF_BIT
368 add al, 80h
369 ; Use SAHF to set the other status flags.
370 sahf
371 %else ; OF not needed; so al -> ah and load ah into eflags.
372 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
373 shl eax, 8
374 %else
375 xchg al, ah
376 %endif
377 sahf
378 %endif
379 %endif ; (%3 | %2) != X86_EFL_CF
380
381 %else
382 pushf ; store current flags
383 mov T0_32, [%1] ; load the guest flags
384 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
385 and T0_32, (%2 | %3) ; select the modified and undefined flags.
386 or [xSP], T0 ; merge guest flags with host flags.
387 popf ; load the mixed flags.
388 %endif
389%endmacro
390
391;;
392; Save the current CPU status flags (%2|%3) into the EFLAGS variable pointed to by %1, clearing the %4 flags there.
393;
394; @remarks Clobbers T0, T1, stack. Some paths ASSUME T0_32 is eax (lahf).
395; @param 1 The register pointing to the EFLAGS.
396; @param 2 The mask of modified flags to save.
397; @param 3 The mask of undefined flags to (maybe) save.
398; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
399;
400%macro IEM_SAVE_FLAGS 3-4 0 ; %4 is optional and defaults to 0 (no zeroed flags).
401 %if (%2 | %3 | %4) != 0
402 mov T1_32, [%1] ; flags
403 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
404 pushf
405 pop T0
406 and T1_32, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
407 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
408 %else
409 %if (%2 | %3 | %4) == X86_EFL_CF
410 setc T0_8 ; stale upper bits of T0 are masked off below (CF is bit 0).
411 %elif (%2 | %3) == X86_EFL_OF
412 seto T0_8
413 shl T0_32, X86_EFL_OF_BIT
414 %elif (%2 | %3) == X86_EFL_ZF
415 setz T0_8 ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
416 shl T0_32, X86_EFL_ZF_BIT
417 %elif (%2 | %3) <= 0xff
418 lahf ; all wanted flags live in AH (low 8 EFLAGS bits).
419 movzx eax, ah ; ASSUMES T0_32 is eax!
420 %elif 1 ; The locked functions are generally faster on 10980XE with this approach
421 lahf ; while there seems only to be a tiny advantage in most other test.
422 movzx eax, ah ; ASSUMES T0_32 is eax!
423 jno .of_is_clear
424 or eax, X86_EFL_OF
425.of_is_clear:
426 %else
427 pushf ; this is a bit slow
428 pop T0
429 %endif
430 and T1_32, ~(%2 | %3 | %4) ; clear the modified & undefined & zeroed flags.
431 and T0_32, (%2 | %3) ; select the modified and undefined flags.
432 %endif
433 or T0_32, T1_32 ; combine the flags.
434 mov [%1], T0_32 ; save the flags.
435 %endif
%endmacro
437
438;;
439; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
440;
441; @remarks Clobbers T0, T1, stack.
442; @param 1 The register pointing to the EFLAGS.
443; @param 2 The mask of modified flags to save.
444; @param 3 Mask of additional flags to always clear
445; @param 4 Mask of additional flags to always set.
446;
447%macro IEM_SAVE_AND_ADJUST_FLAGS 4
448 %if (%2 | %3 | %4) != 0
449 pushf
450 pop T1 ; grab the current CPU status flags.
451 mov T0_32, [%1] ; load flags.
452 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
453 and T1_32, (%2) ; select the modified flags.
454 or T0_32, T1_32 ; combine the flags.
455 %if (%4) != 0
456 or T0_32, %4 ; add the always set flags.
457 %endif
458 mov [%1], T0_32 ; save the result.
459 %endif
460%endmacro
461
462;;
463; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
464; signed input (%4[%5]) and parity index (%6).
465;
466; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
467; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
468; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
469;
470; @remarks Clobbers T0, T1, stack, %6, EFLAGS (and T2 on AMD64).
471; @param 1 The register pointing to the EFLAGS.
472; @param 2 The mask of modified flags to save.
473; @param 3 Mask of additional flags to always clear
474; @param 4 The result register to set SF by.
475; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
476; @param 6 The (full) register containing the parity table index. Will be modified!
477
478%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
479 %ifdef RT_ARCH_AMD64
480 pushf
481 pop T2 ; T2 is free on AMD64, so T0/xAX needn't be saved.
482 %else
483 push T0 ; x86 has no third scratch register, so preserve T0/xAX around the flag grab.
484 pushf
485 pop T0
486 %endif
487 mov T1_32, [%1] ; load flags.
488 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
489 %ifdef RT_ARCH_AMD64
490 and T2_32, (%2) ; select the modified flags.
491 or T1_32, T2_32 ; combine the flags.
492 %else
493 and T0_32, (%2) ; select the modified flags.
494 or T1_32, T0_32 ; combine the flags.
495 pop T0
496 %endif
497
498 ; First calculate SF as it's likely to be referring to the same register as %6 does.
499 bt %4, %5 - 1 ; CF = sign bit of the result.
500 jnc %%sf_clear
501 or T1_32, X86_EFL_SF
502 %%sf_clear:
503
504 ; Parity last.
505 and %6, 0xff ; the parity table is indexed by the low byte of the result.
506 %ifdef RT_ARCH_AMD64
507 lea T2, [NAME(g_afParity) xWrtRIP]
508 or T1_8, [T2 + %6]
509 %else
510 or T1_8, [NAME(g_afParity) + %6]
511 %endif
512
513 mov [%1], T1_32 ; save the result.
514%endmacro
515
516;;
517; Calculates the new EFLAGS using fixed clear and set bit masks.
518;
519; @remarks Clobbers T0.
520; @param 1 The register pointing to the EFLAGS.
521; @param 2 Mask of additional flags to always clear
522; @param 3 Mask of additional flags to always set.
523;
524%macro IEM_ADJUST_FLAGS 3
525 %if (%2 | %3) != 0
526 mov T0_32, [%1] ; Load flags.
527 %if (%2) != 0
528 and T0_32, ~(%2) ; Remove the always cleared flags.
529 %endif
530 %if (%3) != 0
531 or T0_32, %3 ; Add the always set flags.
532 %endif
533 mov [%1], T0_32 ; Save the result.
534 %endif
535%endmacro
536
537;;
538; Calculates the new EFLAGS using fixed clear and set bit masks, plus PF from a table lookup.
539;
540; @remarks Clobbers T0, %4, EFLAGS (and T2 on AMD64).
541; @param 1 The register pointing to the EFLAGS.
542; @param 2 Mask of additional flags to always clear
543; @param 3 Mask of additional flags to always set.
544; @param 4 The (full) register containing the parity table index. Will be modified!
545;
546%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
547 mov T0_32, [%1] ; Load flags.
548 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
549 %if (%3) != 0
550 or T0_32, %3 ; Add the always set flags.
551 %endif
552 and %4, 0xff ; the parity table is indexed by the low byte only.
553 %ifdef RT_ARCH_AMD64
554 lea T2, [NAME(g_afParity) xWrtRIP]
555 or T0_8, [T2 + %4]
556 %else
557 or T0_8, [NAME(g_afParity) + %4]
558 %endif
559 mov [%1], T0_32 ; Save the result.
560%endmacro
561
562
563;;
564; Checks that the size expression %1 matches %2 adjusted according to
565; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries (a mismatch makes a dw operand overflow, triggering an assembler warning).
566; @param 1 The jump array size assembly expression.
567; @param 2 The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
568;
569%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
570 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
571 dw (0xffff - %2 - 256*4) + %1 ; will cause warning if entries are too big.
572 dw (0xffff + %2 + 256*4) - %1 ; will cause warning if entries are too small.
573 %else
574 dw (0xffff - %2) + %1 ; will cause warning if entries are too big.
575 dw (0xffff + %2) - %1 ; will cause warning if entries are too small.
576 %endif
577%endmacro
578
579
580;*********************************************************************************************************************************
581;* External Symbols *
582;*********************************************************************************************************************************
583extern NAME(g_afParity)
584
585
586;;
587; Macro for implementing a binary operator.
588;
589; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
590; variants, except on 32-bit system where the 64-bit accesses requires hand
591; coding.
592;
593; All the functions takes a pointer to the destination memory operand in A0,
594; the source register operand in A1 and a pointer to eflags in A2.
595;
596; @param 1 The instruction mnemonic.
597; @param 2 Non-zero if there should be a locked version.
598; @param 3 The modified flags.
599; @param 4 The undefined flags.
600; @param 5 The flags that must be loaded (ADC, SBB).
601; @param 6 The flags that will be zeroed by the operation.
602;
603%macro IEMIMPL_BIN_OP 6
604BEGINCODE
605BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
606 PROLOGUE_3_ARGS
607 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
608 %1 byte [A0], A1_8
609 IEM_SAVE_FLAGS A2, %3, %4, %6
610 EPILOGUE_3_ARGS
611ENDPROC iemAImpl_ %+ %1 %+ _u8
612
613BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
614 PROLOGUE_3_ARGS
615 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
616 %1 word [A0], A1_16
617 IEM_SAVE_FLAGS A2, %3, %4, %6
618 EPILOGUE_3_ARGS
619ENDPROC iemAImpl_ %+ %1 %+ _u16
620
621BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
622 PROLOGUE_3_ARGS
623 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
624 %1 dword [A0], A1_32
625 IEM_SAVE_FLAGS A2, %3, %4, %6
626 EPILOGUE_3_ARGS
627ENDPROC iemAImpl_ %+ %1 %+ _u32
628
629 %ifdef RT_ARCH_AMD64
630BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
631 PROLOGUE_3_ARGS
632 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
633 %1 qword [A0], A1
634 IEM_SAVE_FLAGS A2, %3, %4, %6
635 EPILOGUE_3_ARGS_EX 8
636ENDPROC iemAImpl_ %+ %1 %+ _u64
637 %endif ; RT_ARCH_AMD64
638
639 %if %2 != 0 ; locked versions requested?
640
641BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
642 PROLOGUE_3_ARGS
643 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
644 lock %1 byte [A0], A1_8
645 IEM_SAVE_FLAGS A2, %3, %4, %6
646 EPILOGUE_3_ARGS
647ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
648
649BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
650 PROLOGUE_3_ARGS
651 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
652 lock %1 word [A0], A1_16
653 IEM_SAVE_FLAGS A2, %3, %4, %6
654 EPILOGUE_3_ARGS
655ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
656
657BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
658 PROLOGUE_3_ARGS
659 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
660 lock %1 dword [A0], A1_32
661 IEM_SAVE_FLAGS A2, %3, %4, %6
662 EPILOGUE_3_ARGS
663ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
664
665 %ifdef RT_ARCH_AMD64
666BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
667 PROLOGUE_3_ARGS
668 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
669 lock %1 qword [A0], A1
670 IEM_SAVE_FLAGS A2, %3, %4, %6
671 EPILOGUE_3_ARGS_EX 8
672ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
673 %endif ; RT_ARCH_AMD64
674 %endif ; locked
675%endmacro
676
677; instr,lock, modified-flags, undefined flags, must be loaded, zeroed flags
678IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
679IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0 ; adc consumes CF, so it must be loaded.
680IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
681IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0 ; sbb consumes CF, so it must be loaded.
682IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0 ; no locked variant: cmp doesn't write the destination.
683IEMIMPL_BIN_OP or, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
684IEMIMPL_BIN_OP xor, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
685IEMIMPL_BIN_OP and, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
686IEMIMPL_BIN_OP test, 0, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF ; no locked variant: test doesn't write the destination.
687
688
689;;
690; Macro for implementing a binary operator, VEX variant with separate input/output.
691;
692; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
693; where the 64-bit accesses requires hand coding.
694;
695; All the functions takes a pointer to the destination memory operand in A0,
696; the first source register operand in A1, the second source register operand
697; in A2 and a pointer to eflags in A3.
698;
699; @param 1 The instruction mnemonic.
700; @param 2 The modified flags.
701; @param 3 The undefined flags.
702; @param 4 The zeroed flags.
703;
704%macro IEMIMPL_VEX_BIN_OP 4
705BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
706 PROLOGUE_4_ARGS
707 IEM_MAYBE_LOAD_FLAGS A3, %2, %3, 0 ;; @todo do we need to load undefined flags for any platform?
708 %1 T0_32, A1_32, A2_32
709 mov [A0], T0_32
710 IEM_SAVE_FLAGS A3, %2, %3, %4
711 EPILOGUE_4_ARGS
712ENDPROC iemAImpl_ %+ %1 %+ _u32
713
714 %ifdef RT_ARCH_AMD64
715BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
716 PROLOGUE_4_ARGS
717 IEM_MAYBE_LOAD_FLAGS A3, %2, %3, 0
718 %1 T0, A1, A2
719 mov [A0], T0
720 IEM_SAVE_FLAGS A3, %2, %3, %4
721 EPILOGUE_4_ARGS
722ENDPROC iemAImpl_ %+ %1 %+ _u64
723 %endif ; RT_ARCH_AMD64
724%endmacro
725
726; instr, modified-flags, undefined-flags, zeroed-flags
727IEMIMPL_VEX_BIN_OP andn, X86_EFL_SF | X86_EFL_ZF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
728IEMIMPL_VEX_BIN_OP bextr, X86_EFL_ZF, X86_EFL_SF | X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
729IEMIMPL_VEX_BIN_OP bzhi, X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF
730
731;;
732; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
733;
734; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
735; where the 64-bit accesses requires hand coding.
736;
737; All the functions takes a pointer to the destination memory operand in A0,
738; the source register operand in A1 and a pointer to eflags in A2.
739;
740; @param 1 The instruction mnemonic.
741; @param 2 The modified flags.
742; @param 3 The undefined flags.
743; @param 4 The zeroed flags.
744;
745%macro IEMIMPL_VEX_BIN_OP_2 4
746BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
747 PROLOGUE_4_ARGS ; note: 4-arg frame although only A0..A2 are used (extra registers saved harmlessly).
748 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0 ;; @todo check if any undefined flags are passed thru
749 mov T0_32, [A0]
750 %1 T0_32, A1_32
751 mov [A0], T0_32
752 IEM_SAVE_FLAGS A2, %2, %3, %4
753 EPILOGUE_4_ARGS
754ENDPROC iemAImpl_ %+ %1 %+ _u32
755
756 %ifdef RT_ARCH_AMD64
757BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
758 PROLOGUE_4_ARGS
759 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
760 mov T0, [A0]
761 %1 T0, A1
762 mov [A0], T0
763 IEM_SAVE_FLAGS A2, %2, %3, %4
764 EPILOGUE_4_ARGS
765ENDPROC iemAImpl_ %+ %1 %+ _u64
766 %endif ; RT_ARCH_AMD64
767%endmacro
768
769; instr, modified-flags, undefined-flags, zeroed-flags
770IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
771IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
772IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
773
774
775;;
776; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
777;
778; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
779; where the 64-bit accesses requires hand coding.
780;
781; All the functions takes a pointer to the destination memory operand in A0,
782; the first source register operand in A1 and the second source register operand
783; in A2. No eflags pointer - these helpers do not update the guest EFLAGS.
784;
785; @param 1 The instruction mnemonic.
786; @param 2 Fallback instruction if applicable.
787; @param 3 Whether to emit fallback or not.
788;
789%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
790BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
791 PROLOGUE_3_ARGS
792 %1 T0_32, A1_32, A2_32
793 mov [A0], T0_32
794 EPILOGUE_3_ARGS
795ENDPROC iemAImpl_ %+ %1 %+ _u32
796
797 %if %3
798BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
799 PROLOGUE_3_ARGS
800 %ifdef ASM_CALL64_GCC
801 mov cl, A2_8 ; cl is free here (A3), so just copy the shift count over.
802 %2 A1_32, cl
803 mov [A0], A1_32
804 %else
805 xchg A2, A0 ; cl aliases A0_8, so swap: cl = shift count, A2 = destination pointer.
806 %2 A1_32, cl
807 mov [A2], A1_32
808 %endif
809 EPILOGUE_3_ARGS
810ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
811 %endif
812
813 %ifdef RT_ARCH_AMD64
814BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
815 PROLOGUE_3_ARGS
816 %1 T0, A1, A2
817 mov [A0], T0
818 EPILOGUE_3_ARGS
819ENDPROC iemAImpl_ %+ %1 %+ _u64
820
821 %if %3
822BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
823 PROLOGUE_3_ARGS
824 %ifdef ASM_CALL64_GCC
825 mov cl, A2_8 ; cl is free here (A3), so just copy the shift count over.
826 %2 A1, cl
827 mov [A0], A1 ; store the full 64-bit result (fixed: was a 32-bit A1_32 store).
828 %else
829 xchg A2, A0 ; cl aliases A0_8, so swap: cl = shift count, A2 = destination pointer.
830 %2 A1, cl
831 mov [A2], A1 ; store the full 64-bit result through A2, the destination pointer after the xchg.
832 %endif
833 EPILOGUE_3_ARGS
834ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
835 %endif
836 %endif ; RT_ARCH_AMD64
837%endmacro
839
840; instr, fallback instr, emit fallback
841IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1 ; NOTE(review): the sar/shl/shr fallbacks clobber EFLAGS whereas the VEX forms do not - presumably the callers account for this; confirm.
842IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
843IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
844IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0 ; no asm fallback emitted (fallback implemented in C).
845IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0 ; no asm fallback emitted (fallback implemented in C).
846
847
848;
849; RORX uses an immediate byte for the shift count, so we only do
850; fallback implementation of that one.
851;
852BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
853 PROLOGUE_3_ARGS
854 %ifdef ASM_CALL64_GCC
855 mov cl, A2_8 ; cl is free here (A3), so just copy the rotate count over.
856 ror A1_32, cl
857 mov [A0], A1_32
858 %else
859 xchg A2, A0 ; cl aliases A0_8, so swap: cl = rotate count, A2 = destination pointer.
860 ror A1_32, cl
861 mov [A2], A1_32
862 %endif
863 EPILOGUE_3_ARGS
864ENDPROC iemAImpl_rorx_u32
865
866 %ifdef RT_ARCH_AMD64
867BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
868 PROLOGUE_3_ARGS
869 %ifdef ASM_CALL64_GCC
870 mov cl, A2_8 ; cl is free here (A3), so just copy the rotate count over.
871 ror A1, cl
872 mov [A0], A1
873 %else
874 xchg A2, A0 ; cl aliases A0_8, so swap: cl = rotate count, A2 = destination pointer.
875 ror A1, cl
876 mov [A2], A1
877 %endif
878 EPILOGUE_3_ARGS
879ENDPROC iemAImpl_rorx_u64
880 %endif ; RT_ARCH_AMD64
881
882
883;
884; MULX - unsigned multiply with EDX/RDX as the implicit second source operand.
885;
886BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
887 PROLOGUE_4_ARGS
888%ifdef ASM_CALL64_GCC
889 ; A2_32 is EDX - perfect
890 mulx T0_32, T1_32, A3_32
891 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
892 mov [A0], T0_32
893%else
894 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
895 xchg A1, A2
896 mulx T0_32, T1_32, A3_32
897 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
898 mov [A0], T0_32
899%endif
900 EPILOGUE_4_ARGS
901ENDPROC iemAImpl_mulx_u32
902
903
904BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16 ; non-BMI2 fallback using plain MUL; NOTE(review): MUL clobbers EFLAGS while MULX does not - presumably handled by the caller; confirm.
905 PROLOGUE_4_ARGS
906%ifdef ASM_CALL64_GCC
907 ; A2_32 is EDX, T0_32 is EAX
908 mov eax, A3_32
909 mul A2_32
910 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
911 mov [A0], edx
912%else
913 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
914 xchg A1, A2
915 mov eax, A3_32
916 mul A2_32
917 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
918 mov [A0], edx
919%endif
920 EPILOGUE_4_ARGS
921ENDPROC iemAImpl_mulx_u32_fallback
922
923%ifdef RT_ARCH_AMD64
924BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
925 PROLOGUE_4_ARGS
926%ifdef ASM_CALL64_GCC
927 ; A2 is RDX - perfect
928 mulx T0, T1, A3
929 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
930 mov [A0], T0
931%else
932 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
933 xchg A1, A2
934 mulx T0, T1, A3
935 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
936 mov [A0], T0
937%endif
938 EPILOGUE_4_ARGS
939ENDPROC iemAImpl_mulx_u64
940
941
942BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16 ; non-BMI2 fallback using plain MUL; NOTE(review): MUL clobbers EFLAGS while MULX does not - presumably handled by the caller; confirm.
943 PROLOGUE_4_ARGS
944%ifdef ASM_CALL64_GCC
945 ; A2 is RDX, T0 is RAX
946 mov rax, A3
947 mul A2
948 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
949 mov [A0], rdx
950%else
951 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
952 xchg A1, A2
953 mov rax, A3
954 mul A2
955 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
956 mov [A0], rdx
957%endif
958 EPILOGUE_4_ARGS
959ENDPROC iemAImpl_mulx_u64_fallback
960
961%endif ; RT_ARCH_AMD64
962
963
964;;
965; Macro for implementing a bit operator.
966;
967; This will generate code for the 16, 32 and 64 bit accesses with locked
968; variants, except on 32-bit system where the 64-bit accesses requires hand
969; coding. (No 8-bit variants: these instructions only take 16/32/64-bit operands.)
970;
971; All the functions takes a pointer to the destination memory operand in A0,
972; the source register operand in A1 and a pointer to eflags in A2.
973;
974; @param 1 The instruction mnemonic.
975; @param 2 Non-zero if there should be a locked version.
976; @param 3 The modified flags.
977; @param 4 The undefined flags.
978;
979%macro IEMIMPL_BIT_OP 4
980BEGINCODE
981BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
982 PROLOGUE_3_ARGS
983 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, 0
984 %1 word [A0], A1_16
985 IEM_SAVE_FLAGS A2, %3, %4, 0
986 EPILOGUE_3_ARGS
987ENDPROC iemAImpl_ %+ %1 %+ _u16
988
989BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
990 PROLOGUE_3_ARGS
991 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, 0
992 %1 dword [A0], A1_32
993 IEM_SAVE_FLAGS A2, %3, %4, 0
994 EPILOGUE_3_ARGS
995ENDPROC iemAImpl_ %+ %1 %+ _u32
996
997 %ifdef RT_ARCH_AMD64
998BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
999 PROLOGUE_3_ARGS
1000 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, 0
1001 %1 qword [A0], A1
1002 IEM_SAVE_FLAGS A2, %3, %4, 0
1003 EPILOGUE_3_ARGS_EX 8
1004ENDPROC iemAImpl_ %+ %1 %+ _u64
1005 %endif ; RT_ARCH_AMD64
1006
1007 %if %2 != 0 ; locked versions requested?
1008
1009BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
1010 PROLOGUE_3_ARGS
1011 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, 0
1012 lock %1 word [A0], A1_16
1013 IEM_SAVE_FLAGS A2, %3, %4, 0
1014 EPILOGUE_3_ARGS
1015ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1016
1017BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
1018 PROLOGUE_3_ARGS
1019 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, 0
1020 lock %1 dword [A0], A1_32
1021 IEM_SAVE_FLAGS A2, %3, %4, 0
1022 EPILOGUE_3_ARGS
1023ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1024
1025 %ifdef RT_ARCH_AMD64
1026BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
1027 PROLOGUE_3_ARGS
1028 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, 0
1029 lock %1 qword [A0], A1
1030 IEM_SAVE_FLAGS A2, %3, %4, 0
1031 EPILOGUE_3_ARGS_EX 8
1032ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1033 %endif ; RT_ARCH_AMD64
1034 %endif ; locked
1035%endmacro
1036
1037; Undefined flags are passed thru here by the intel and amd CPUs we have.
1038; modified efl, undefined eflags
1039IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) - no locked variant: bt doesn't write the destination.
1040IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1041IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1042IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1043
1044;;
1045; Macro for implementing a bit search operator.
1046;
1047; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1048; system where the 64-bit accesses requires hand coding.
1049;
1050; All the functions takes a pointer to the destination memory operand in A0,
1051; the source register operand in A1 and a pointer to eflags in A2.
1052;
1053; In the ZF case the destination register is 'undefined', however it seems that
1054; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
1055; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
1056; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
1057; the behaviour of more recent CPUs (Intel 10980XE and AMD 3990X).
1058;
1059; Intel: Clear all and calculate PF in addition to ZF.
1060; AMD: Passthru all flags other than ZF.
1061;
1062; @param 1 The instruction mnemonic.
1063; @param 2 The modified flags.
1064; @param 3 The undefined flags.
1065; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
1066;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
; 16-bit

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
        %1      T0_16, A1_16            ; scan A1, result (if any) in T0; sets ZF per the source value
%if %4 != 0
        jz      .unchanged_dst          ; bsf/bsr: ZF=1 means source was zero; leave [A0] untouched
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
;bad; PROLOGUE_3_ARGS
;bad; %1 T1_16, A1_16
;bad; jz .unchanged_dst
;bad; mov [A0], T1_16
;bad; IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
;bad; EPILOGUE_3_ARGS
;bad;.unchanged_dst:
;bad;%if %4 != 0
;bad; mov [A0], T1_16
;bad;%endif
;bad; IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
;bad; EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
;bad;
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
;bad; PROLOGUE_3_ARGS
;bad; %1 T0_16, A1_16
;bad;%if %4 != 0
;bad; jz .unchanged_dst
;bad;%endif
;bad; mov [A0], T0_16
;bad;.unchanged_dst:
;bad; IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
;bad; EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_amd

; 32-bit

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
        %1      T0_32, A1_32            ; scan A1, result (if any) in T0
%if %4 != 0
        jz      .unchanged_dst          ; bsf/bsr: ZF=1 means source was zero; leave [A0] untouched
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
;bad; PROLOGUE_3_ARGS
;bad; %1 T1_32, A1_32
;bad;%if %4 != 0
;bad; jz .unchanged_dst
;bad;%endif
;bad; mov [A0], T1_32
;bad; IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
;bad; EPILOGUE_3_ARGS
;bad;.unchanged_dst:
;bad; IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
;bad; EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
;bad;
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
;bad; PROLOGUE_3_ARGS
;bad; %1 T0_32, A1_32
;bad;%if %4 != 0
;bad; jz .unchanged_dst
;bad;%endif
;bad; mov [A0], T0_32
;bad;.unchanged_dst:
;bad; IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
;bad; EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64
; 64-bit

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
        %1      T0, A1                  ; scan A1, result (if any) in T0
%if %4 != 0
        jz      .unchanged_dst          ; bsf/bsr: ZF=1 means source was zero; leave [A0] untouched
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3, 0
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
;bad; PROLOGUE_3_ARGS
;bad; %1 T1, A1
;bad;%if %4 != 0
;bad; jz .unchanged_dst
;bad;%endif
;bad; mov [A0], T1
;bad; IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
;bad; EPILOGUE_3_ARGS
;bad;.unchanged_dst:
;bad; IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
;bad; EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
;bad;
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
;bad; PROLOGUE_3_ARGS
;bad; %1 T0, A1
;bad;%if %4 != 0
;bad; jz .unchanged_dst
;bad;%endif
;bad; mov [A0], T0
;bad;.unchanged_dst:
;bad; IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
;bad; EPILOGUE_3_ARGS_EX 8
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro
1197
; Parameters: 1=mnemonic, 2=modified EFLAGS, 3=undefined EFLAGS,
;             4=non-zero when the destination is left unwritten on ZF=1
;               (bsf/bsr); tzcnt/lzcnt always write the destination.
IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1202
1203
1204;;
1205; Macro for implementing POPCNT.
1206;
1207; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1208; system where the 64-bit accesses requires hand coding.
1209;
1210; All the functions takes a pointer to the destination memory operand in A0,
1211; the source register operand in A1 and a pointer to eflags in A2.
1212;
1213; ASSUMES Intel and AMD set EFLAGS the same way.
1214;
1215; ASSUMES the instruction does not support memory destination.
1216;
1217; @param 1 The instruction mnemonic.
1218; @param 2 The modified flags.
1219; @param 3 The undefined flags.
1220; @param 4 The zeroed flags.
1221;
%macro IEMIMPL_BIT_OP3 4
BEGINCODE
; A0 = pointer to the destination, A1 = source value, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0_16, A1_16
        mov     [A0], T0_16
        IEM_SAVE_FLAGS A2, %2, %3, %4  ; last argument = flags forced to zero
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
; POPCNT: only ZF is calculated; all other status flags are cleared.
IEMIMPL_BIT_OP3 popcnt, X86_EFL_ZF, 0, X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF
1254
1255
1256;
1257; IMUL is also a similar but yet different case (no lock, no mem dst).
1258; The rDX:rAX variant of imul is handled together with mul further down.
1259;
BEGINCODE
;;
; Two-operand IMUL (reg, r/m) workers.
;
; A0 = pointer to the destination operand (read and written back),
; A1 = the multiplier value, A2 = pointer to the eflags variable.
;
; @param 1 EFLAGS that are modified.
; @param 2 Undefined EFLAGS.
; @param 3 Function suffix.
; @param 4 EFLAGS variation: 0 for native, 1 for intel,
; 2 for AMD (set AF, clear PF, ZF and SF).
; (Note: variation 2 currently takes the same IEM_SAVE_FLAGS path as
; native; only the flag masks passed at instantiation differ.)
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
        imul    A1_16, word [A0]
        mov     [A0], A1_16
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2, 0
 %else
        ; Intel: clear AF+ZF and derive SF/PF from the (low half of the) result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1 ; intel
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1 ; intel
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1 ; intel
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
; The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
; flags as is. Whereas Intel skylake (6700K and 10980XE (Cascade Lake)) always
; clear AF and ZF and calculates SF and PF as per the lower half of the result.
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1314
1315
1316;
1317; XCHG for memory operands. This implies locking. No flag changes.
1318;
1319; Each function takes two arguments, first the pointer to the memory,
1320; then the pointer to the register. They all return void.
1321;
BEGINCODE
;
; Byte variant. XCHG with a memory operand is implicitly locked, so no
; explicit LOCK prefix is required for atomicity.
;
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A1]              ; fetch the register operand
        xchg    [A0], T1_8              ; atomic swap with the memory operand
        mov     [A1], T1_8              ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked
1330
; Word variant - same pattern, implicit lock via XCHG.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A1]             ; fetch the register operand
        xchg    [A0], T1_16             ; atomic swap with the memory operand
        mov     [A1], T1_16             ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked
1338
; Dword variant - same pattern, implicit lock via XCHG.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A1]             ; fetch the register operand
        xchg    [A0], T1_32             ; atomic swap with the memory operand
        mov     [A1], T1_32             ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked
1346
%ifdef RT_ARCH_AMD64
; Qword variant - 64-bit hosts only; same pattern, implicit lock via XCHG.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A1]                ; fetch the register operand
        xchg    [A0], T1                ; atomic swap with the memory operand
        mov     [A1], T1                ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1356
1357; Unlocked variants for fDisregardLock mode.
1358
; Non-atomic byte exchange: read both operands first, then cross-store them.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A0]              ; old memory value
        mov     T0_8, [A1]              ; register operand
        mov     [A1], T1_8              ; register <- old memory value
        mov     [A0], T0_8              ; memory   <- register operand
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1367
; Non-atomic word exchange.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A0]             ; old memory value
        mov     T0_16, [A1]             ; register operand
        mov     [A1], T1_16             ; register <- old memory value
        mov     [A0], T0_16             ; memory   <- register operand
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1376
; Non-atomic dword exchange.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A0]             ; old memory value
        mov     T0_32, [A1]             ; register operand
        mov     [A1], T1_32             ; register <- old memory value
        mov     [A0], T0_32             ; memory   <- register operand
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1385
%ifdef RT_ARCH_AMD64
; Non-atomic qword exchange - 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A0]                ; old memory value
        mov     T0, [A1]                ; register operand
        mov     [A1], T1                ; register <- old memory value
        mov     [A0], T0                ; memory   <- register operand
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1396
1397
1398;
1399; XADD for memory operands.
1400;
1401; Each function takes three arguments, first the pointer to the
1402; memory/register, then the pointer to the register, and finally a pointer to
1403; eflags. They all return void.
1404;
BEGINCODE
; Unlocked XADD: [A0] += *A1 while *A1 receives the old [A0] value; all
; six arithmetic status flags come from the addition and are stored to *A2.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]              ; fetch the register operand
        xadd    [A0], T0_8              ; [A0] += T0; T0 = old [A0]
        mov     [A1], T0_8              ; return the old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1447
; Locked XADD variants - identical to the above except for the LOCK prefix
; which makes the read-modify-write of [A0] atomic.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8            ; atomic: [A0] += T0; T0 = old [A0]
        mov     [A1], T0_8
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1489
1490
1491;
1492; CMPXCHG8B.
1493;
1494; These are tricky register wise, so the code is duplicated for each calling
1495; convention.
1496;
1497; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1498;
1499; C-proto:
1500; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1501; uint32_t *pEFlags));
1502;
1503; Note! Identical to iemAImpl_cmpxchg16b.
1504;
BEGINCODE
; CMPXCHG8B needs edx:eax (comparand, updated on failure) and ecx:ebx
; (replacement value) in fixed registers, hence the per-convention shuffling.
; Only ZF is defined for the instruction.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; MSC: rcx=pu64Dst, rdx=pu64EaxEdx, r8=pu64EbxEcx, r9=pEFlags.
        push    rbx                     ; callee-saved, needed for ebx

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; replacement low dword
        mov     ecx, [r8 + 4]           ; replacement high dword
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; comparand low dword
        mov     edx, [r11 + 4]          ; comparand high dword

        cmpxchg8b [r10]

        mov     [r11], eax              ; write back edx:eax (updated on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; SysV: rdi=pu64Dst, rsi=pu64EaxEdx, rdx=pu64EbxEcx, rcx=pEFlags.
        push    rbx                     ; callee-saved, needed for ebx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; replacement low dword
        mov     ecx, [r11 + 4]          ; replacement high dword
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; comparand low dword
        mov     edx, [rsi + 4]          ; comparand high dword

        cmpxchg8b [rdi]

        mov     [rsi], eax              ; write back edx:eax (updated on mismatch)
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; x86 fastcall: ecx=pu64Dst, edx=pu64EaxEdx, stack: pu64EbxEcx, pEFlags.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; replacement low dword
        mov     ecx, [ecx + 4]          ; replacement high dword (ecx no longer needed as pointer)
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; comparand low dword
        mov     edx, [esi + 4]          ; comparand high dword

        cmpxchg8b [edi]

        mov     [esi], eax              ; write back edx:eax (updated on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1580
; Same as iemAImpl_cmpxchg8b above, except the cmpxchg8b carries a LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; MSC: rcx=pu64Dst, rdx=pu64EaxEdx, r8=pu64EbxEcx, r9=pEFlags.
        push    rbx                     ; callee-saved, needed for ebx

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; replacement low dword
        mov     ecx, [r8 + 4]           ; replacement high dword
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; comparand low dword
        mov     edx, [r11 + 4]          ; comparand high dword

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write back edx:eax (updated on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; SysV: rdi=pu64Dst, rsi=pu64EaxEdx, rdx=pu64EbxEcx, rcx=pEFlags.
        push    rbx                     ; callee-saved, needed for ebx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; replacement low dword
        mov     ecx, [r11 + 4]          ; replacement high dword
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; comparand low dword
        mov     edx, [rsi + 4]          ; comparand high dword

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; write back edx:eax (updated on mismatch)
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; x86 fastcall: ecx=pu64Dst, edx=pu64EaxEdx, stack: pu64EbxEcx, pEFlags.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; replacement low dword
        mov     ecx, [ecx + 4]          ; replacement high dword
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; comparand low dword
        mov     edx, [esi + 4]          ; comparand high dword

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; write back edx:eax (updated on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
1655
1656%ifdef RT_ARCH_AMD64
1657
1658;
1659; CMPXCHG16B.
1660;
1661; These are tricky register wise, so the code is duplicated for each calling
1662; convention.
1663;
1664; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1665;
1666; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1668; uint32_t *pEFlags));
1669;
1670; Note! Identical to iemAImpl_cmpxchg8b.
1671;
BEGINCODE
; CMPXCHG16B needs rdx:rax (comparand, updated on failure) and rcx:rbx
; (replacement value) in fixed registers; only ZF is defined.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        ; MSC: rcx=pu128Dst, rdx=pu128RaxRdx, r8=pu128RbxRcx, r9=pEFlags.
        push    rbx                     ; callee-saved, needed for rbx

        mov     r11, rdx                ; pu128RaxRdx (is also T1)
        mov     r10, rcx                ; pu128Dst

        mov     rbx, [r8]               ; replacement low qword
        mov     rcx, [r8 + 8]           ; replacement high qword
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; comparand low qword
        mov     rdx, [r11 + 8]          ; comparand high qword

        cmpxchg16b [r10]

        mov     [r11], rax              ; write back rdx:rax (updated on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; SysV: rdi=pu128Dst, rsi=pu128RaxRdx, rdx=pu128RbxRcx, rcx=pEFlags.
        push    rbx                     ; callee-saved, needed for rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu128RbxRcx (is also T1)

        mov     rbx, [r11]              ; replacement low qword
        mov     rcx, [r11 + 8]          ; replacement high qword
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; comparand low qword
        mov     rdx, [rsi + 8]          ; comparand high qword

        cmpxchg16b [rdi]

        mov     [rsi], rax              ; write back rdx:rax (updated on mismatch)
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1717
; Same as iemAImpl_cmpxchg16b above, except the cmpxchg16b carries a LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        ; MSC: rcx=pu128Dst, rdx=pu128RaxRdx, r8=pu128RbxRcx, r9=pEFlags.
        push    rbx                     ; callee-saved, needed for rbx

        mov     r11, rdx                ; pu128RaxRdx (is also T1)
        mov     r10, rcx                ; pu128Dst

        mov     rbx, [r8]               ; replacement low qword
        mov     rcx, [r8 + 8]           ; replacement high qword
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; comparand low qword
        mov     rdx, [r11 + 8]          ; comparand high qword

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write back rdx:rax (updated on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; SysV: rdi=pu128Dst, rsi=pu128RaxRdx, rdx=pu128RbxRcx, rcx=pEFlags.
        push    rbx                     ; callee-saved, needed for rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu128RbxRcx (is also T1)

        mov     rbx, [r11]              ; replacement low qword
        mov     rcx, [r11 + 8]          ; replacement high qword
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; comparand low qword
        mov     rdx, [rsi + 8]          ; comparand high qword

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; write back rdx:rax (updated on mismatch)
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked
1762
1763%endif ; RT_ARCH_AMD64
1764
1765
1766;
1767; CMPXCHG.
1768;
1769; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1770;
1771; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
1773;
BEGINCODE
; @param 1 Prefix emitted in front of cmpxchg ('lock' or empty).
; @param 2 Function name suffix ('_locked' or empty).
; A0=puXDst, A1=puEax (accumulator, in/out), A2=uReg (replacement), A3=pEFlags.
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; cmpxchg compares against al
        %1 cmpxchg [A0], A2_8
        mov     [A1], al                ; al is loaded with [A0] on mismatch
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; cmpxchg compares against ax
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; cmpxchg compares against eax
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; cmpxchg compares against rax
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; replacement low dword
        mov     ecx, [ecx + 4]          ; replacement high dword
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; comparand low dword
        mov     edx, [esi + 4]          ; comparand high dword

        lock cmpxchg8b [edi]            ; always locked here; cmpxchg8b only defines ZF

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        jz .cmpxchg8b_not_equal         ; NOTE: inverted label name, taken on the EQUAL case (ZF=1)
;; @todo this isn't correct. Need to do a 64-bit compare, not just the lower 32-bit.
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax              ; write back edx:eax (updated on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1864
1865
1866
1867;;
1868; Macro for implementing a unary operator.
1869;
1870; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1871; variants, except on 32-bit system where the 64-bit accesses requires hand
1872; coding.
1873;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1 (there is no source register operand here).
1876;
1877; @param 1 The instruction mnemonic.
1878; @param 2 The modified flags.
1879; @param 3 The undefined flags.
1880;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
; A0 = pointer to the operand (read-modify-write), A1 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; INC/DEC leave CF untouched; NEG modifies all status flags; NOT touches none.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1955
1956
1957;
1958; BSWAP. No flag changes.
1959;
1960; Each function takes one argument, pointer to the value to bswap
1961; (input/output). They all return void.
1962;
; 16-bit BSWAP: the hand-emitted 0x66 operand-size prefix turns 'bswap r32'
; into the 16-bit form, whose result is architecturally undefined; executing
; it natively makes the guest observe whatever the host CPU does for it.
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h                          ; operand-size prefix for the bswap below
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1971
; 32-bit BSWAP: reverse the byte order of the dword at [A0] in place.
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T1_32, [A0]             ; load the value
        bswap   T1_32                   ; reverse byte order
        mov     [A0], T1_32             ; store it back
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1979
; 64-bit BSWAP. On 32-bit hosts this is done as two 32-bit bswaps whose
; results are cross-stored to swap the qword halves as well.
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; low dword
        mov     T1, [A0 + 4]            ; high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; bswapped low dword becomes the high dword
        mov     [A0], T1                ; bswapped high dword becomes the low dword
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1998
1999
2000;;
2001; Macro for implementing a shift operation.
2002;
2003; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2004; 32-bit system where the 64-bit accesses requires hand coding.
2005;
2006; All the functions takes a pointer to the destination memory operand in A0,
2007; the shift count in A1 and a pointer to eflags in A2.
2008;
2009; @param 1 The instruction mnemonic.
2010; @param 2 The modified flags.
2011; @param 3 The undefined flags.
2012; @param 4 Force load flags.
2013;
2014; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
2015;
2016; @note the _intel and _amd variants are implemented in C.
2017;
%macro IEMIMPL_SHIFT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; SysV: A0=rdi holds the pointer, count goes to cl
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; MSC: A0=rcx, A1=rdx; swap so the count lands in cl
        %1      byte [A1], cl           ; ... and the operand pointer is now in A1
 %endif
        IEM_SAVE_FLAGS A2, %2, %3, 0
.zero_shift:                            ; note: no visible jump targets this label (vestigial)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0                  ; put the count in cl, the pointer in A1
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0                  ; put the count in cl, the pointer in A1
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
 %else
        xchg    A1, A0                  ; put the count in cl, the pointer in A1
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
; we have to force load all modified and undefined.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2091
2092
;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The functions takes the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; @param 1 The instruction mnemonic (shld/shrd).
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 The force loaded flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments: the shift count must
; end up in cl, so the code below shuffles registers depending on which
; calling convention put what into rcx.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; GCC: count is in A2 (rdx), cl aliases A3 (rcx) - swap so cl holds the count.
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; Swap back so A3 is the eflags pointer again for IEM_SAVE_FLAGS.
 %else
        xchg    A0, A2                  ; MSC: cl aliases A0 (rcx) - swap so cl holds the count and A2 the operand ptr.
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; Get the count into cl (rcx), see the _u16 variant.
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; Get the count into cl (rcx), see the _u16 variant.
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3, 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
; we have to force load all modified and undefined flags.
IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2170
2171
;;
; Macro for implementing a multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer. The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 so the code can be shared with div/idiv, which
; use the return value to signal a #DE condition.
;
; @param 1 The instruction mnemonic (mul/imul).
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 Name suffix.
; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     al, [A0]
        %1      A1_8                    ; al * A1_8 -> ax
        mov     [A0], ax
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX ; intel
 %endif
        xor     eax, eax                ; Return 0 (no #DE) - see macro header.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16                   ; ax * A2_16 -> dx:ax
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; MSC: A1 is rdx which the instruction clobbers, so stash the pointer first.
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX ; intel
 %endif
        xor     eax, eax                ; Return 0 (no #DE).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32                   ; eax * A2_32 -> edx:eax
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; MSC: A1 is rdx which the instruction clobbers, so stash the pointer first.
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX ; intel
 %endif
        xor     eax, eax                ; Return 0 (no #DE).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     rax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2                      ; rax * A2 -> rdx:rax
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; MSC: A1 is rdx which the instruction clobbers, so stash the pointer first.
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX ; intel
 %endif
        xor     eax, eax                ; Return 0 (no #DE).
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2289
2290
BEGINCODE
;;
; Worker function for negating a 64-bit number held in the T1:T0 register
; pair as two 32-bit halves (T1 = high, T0 = low): computes 0 - T1:T0.
;
; Implemented by pushing two zero qwords, swapping the inputs onto the
; stack, and performing the wide subtraction 0 - value with sub/sbb so the
; borrow propagates from the low half to the high half.
;
; @uses None (T0,T1) - only T0/T1 are modified; EFLAGS are clobbered by sub/sbb.
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0
        push    0
        xchg    T0_32, [xSP]            ; T0 = 0, low input half now on the stack.
        xchg    T1_32, [xSP + xCB]      ; T1 = 0, high input half now on the stack.
        sub     T0_32, [xSP]            ; T0 = 0 - low.
        sbb     T1_32, [xSP + xCB]      ; T1 = 0 - high - borrow.
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2305
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating a 128-bit number held in the T1:T0 register
; pair as two 64-bit halves (T1 = high, T0 = low): computes 0 - T1:T0.
;
; Same stack-based technique as iemAImpl_negate_T0_T1_u32, just with
; full-width registers.
;
; @uses None (T0,T1) - only T0/T1 are modified; EFLAGS are clobbered by sub/sbb.
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0
        push    0
        xchg    T0, [xSP]               ; T0 = 0, low input half now on the stack.
        xchg    T1, [xSP + xCB]         ; T1 = 0, high input half now on the stack.
        sub     T0, [xSP]               ; T0 = 0 - low.
        sbb     T1, [xSP + xCB]         ; T1 = 0 - high - borrow.
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2321
2322
;;
; Macro for implementing a division operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer. The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 on success and -1 if a divide error should be
; raised by the caller (divisor is zero or the quotient overflows).
;
; @param 1 The instruction mnemonic (div/idiv).
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 1 if signed, 0 if unsigned.
; @param 5 Function suffix.
; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
; 2 for AMD (set AF, clear PF, ZF and SF).
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        and     A1_32, 0xff             ; Ensure it's zero extended to 16-bits for the idiv range check.
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; Quotient overflows when the high dividend byte >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A0]        ; T0 = dividend (zero extending to full register to simplify register aliasing)
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %3 ; Undefined flags may be passed thru (Intel)
        mov     ax, [A0]
        %1      A1_8                    ; ax / A1_8 -> al (quotient), ah (remainder)
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3, 0
 %endif
        xor     eax, eax                ; Return 0 = success.

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; Return -1 = raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        and     A2_16, 0xffff           ; Zero extend it for simpler sign overflow checks (see below).
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; Quotient overflows when the high dividend word >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A1]        ; Zero extend to simplify register aliasing by clobbing the whole register.
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; GCC: the divisor is in A2 (rdx) which dx aliases, so move it out of the way.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16                   ; dx:ax / T1_16 -> ax (quotient), dx (remainder)
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; MSC: A1 is rdx which dx aliases, so stash the pointer first.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3, 0
 %endif
        xor     eax, eax                ; Return 0 = success.

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; Return -1 = raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; Quotient overflows when the high dividend dword >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        ;test A2_32, A2_32 - we did this 5 instructions ago.
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; GCC: the divisor is in A2 (rdx) which edx aliases, so move it out of the way.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32                   ; edx:eax / T1_32 -> eax (quotient), edx (remainder)
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; MSC: A1 is rdx which edx aliases, so stash the pointer first.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3, 0
 %endif
        xor     eax, eax                ; Return 0 = success.

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Drop the saved divisor pushed on the signed path.
 %endif
.div_zero:
        mov     eax, -1                 ; Return -1 = raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2
        jz      .div_zero
 %if %4 == 0
        cmp     [A1], A2                ; Quotient overflows when the high dividend qword >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        ;test A2, A2 - we did this five instructions above.
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; GCC: the divisor is in A2 (rdx) which rdx aliases, so move it out of the way.
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1                      ; rdx:rax / T1 -> rax (quotient), rdx (remainder)
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; MSC: A1 is rdx, so stash the pointer first.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3, 0
 %endif
        xor     eax, eax                ; Return 0 = success.

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Drop the saved divisor pushed on the signed path.
 %endif
.div_zero:
        mov     eax, -1                 ; Return -1 = raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2679
2680
;;
; Macro for implementing memory fence operation.
;
; No return value, no operands or anything - the generated function just
; executes the fence instruction and returns.
;
; @param 1 The fence instruction (lfence/sfence/mfence).
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2699
;;
; Alternative for non-SSE2 host (no lfence/sfence/mfence available).
;
; A locked read-modify-write access acts as a full memory barrier; xchg with
; a memory operand asserts the LOCK signal implicitly, so the push/xchg pair
; below serializes memory accesses without needing SSE2.
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX
        xchg    xAX, [xSP]              ; Implicitly LOCKed -> full barrier; also restores xAX.
        add     xSP, xCB                ; Drop the scratch stack slot.
        ret
ENDPROC iemAImpl_alt_mem_fence
2709
2710
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; Uses the 32-bit register forms (T0_32/T1_32) like the sibling macro
; FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 below; on AMD64 a 32-bit write
; zero-extends, so this is semantically identical to the full-width forms
; while giving shorter encodings and keeping the two macros consistent.
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; Capture the current (host) environment image.

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK   ; Keep only the guest condition code bits.
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; Keep the actual (host) TOP value.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; Activate the merged environment.
%endmacro
2736
2737
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word, status word, and update the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; Capture the current (host) environment image.

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK   ; Keep only the guest condition code bits.
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; Keep the actual (host) TOP value.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK ; T1 = guest TOP, indexing the guest FTW bit for ST0.
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]                   ; Activate the merged environment.
%endmacro
2775
2776
;;
; Result of an FPU operation: the 80-bit value followed by the output FSW.
; @todo Need to move this as well somewhere better?
struc IEMFPURESULT
    .r80Result resw 5                   ; 80-bit (10 byte) extended precision result.
    .FSW resw 1                         ; Output FPU status word.
endstruc
2784
2785
;;
; Result of an FPU operation producing two values: two 80-bit values with the
; output FSW between them.
; @todo Need to move this as well somewhere better?
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; First 80-bit (10 byte) extended precision result.
    .FSW resw 1                         ; Output FPU status word.
    .r80Result2 resw 5                  ; Second 80-bit (10 byte) extended precision result.
endstruc
2794
2795
2796;
2797;---------------------- 16-bit signed integer operations ----------------------
2798;
2799
2800
;;
; Converts a 16-bit signed integer to a 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state (non-waiting, discards pending exceptions).
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and the safe parts of the guest FSW.
        fild    word [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions so the waiting fstp below cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2824
2825
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A3]              ; Load the input before applying the guest FCW/FSW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word [A1]               ; Return the resulting status word (incl. any exception bits).

        fninit                          ; fninit is non-waiting, so pending exceptions are simply discarded.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2849
2850
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation (fisttp ignores the rounding control).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A3]              ; Load the input before applying the guest FCW/FSW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]               ; Store with truncation regardless of FCW.RC.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Discard pending exceptions and leave a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2875
2876
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; ST0 <op>= 16-bit integer operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions so the waiting fstp below cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2913
2914
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; Compare ST0 against the 16-bit integer; result goes to FSW only.

        fnstsw  word [A1]

        fninit                          ; Discard pending exceptions and leave a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2945
2946
2947
2948;
2949;---------------------- 32-bit signed integer operations ----------------------
2950;
2951
2952
;;
; Converts a 32-bit signed integer to a 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions so the waiting fstp below cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2976
2977
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A3]              ; Load the input before applying the guest FCW/FSW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Discard pending exceptions and leave a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
3001
3002
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation (fisttp ignores the rounding control).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A3]              ; Load the input before applying the guest FCW/FSW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]              ; Store with truncation regardless of FCW.RC.

        fnstsw  word [A1]

        fninit                          ; Discard pending exceptions and leave a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
3027
3028
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; ST0 <op>= 32-bit integer operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions so the waiting fstp below cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
3065
3066
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; Compare ST0 against the 32-bit integer; result goes to FSW only.

        fnstsw  word [A1]

        fninit                          ; Discard pending exceptions and leave a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
3097
3098
3099
3100;
3101;---------------------- 64-bit signed integer operations ----------------------
3102;
3103
3104
;;
; Converts a 64-bit signed integer to a 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions so the waiting fstp below cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
3128
3129
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A3]              ; Load the input before applying the guest FCW/FSW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Discard pending exceptions and leave a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
3153
3154
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation (fisttp ignores the rounding control).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A3]              ; Load the input before applying the guest FCW/FSW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]              ; Store with truncation regardless of FCW.RC.

        fnstsw  word [A1]

        fninit                          ; Discard pending exceptions and leave a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
3179
3180
3181
3182;
3183;---------------------- 32-bit floating point operations ----------------------
3184;
3185
;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions so the waiting fstp below cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
3209
3210
;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A3]              ; Load the input before applying the guest FCW/FSW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]              ; Rounds according to the guest FCW.RC/PC.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Discard pending exceptions and leave a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
3234
3235
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch area for the FPU environment image used by the macro below.

        fninit                          ; Known clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; ST0 <op>= 32-bit float operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions so the waiting fstp below cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3273
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A2]              ; ST0 = 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; e.g. fcom dword [A3] - only flags/FSW matter

        fnstsw  word [A1]               ; return the resulting FSW (no value result)

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3304
3305
3306
3307;
3308;---------------------- 64-bit floating point operations ----------------------
3309;
3310
;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW before the load so masks/rounding are the guest's
        fld     qword [A2]              ; load & widen the r64 input; masked exceptions land in FSW

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so fstp cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3334
3335
;;
; Store a 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A3]              ; push source before applying guest FCW (lossless 80-bit load)
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW governs the narrowing below
        fst     qword [A2]              ; narrow to r64; masked exceptions recorded in FSW

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3359
3360
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A2]              ; ST0 = 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW before executing the instruction
        %1      qword [A3]              ; e.g. fadd qword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture FSW before clearing exceptions
        fnclex                          ; clear pending exceptions so fstp cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3397
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A2]              ; ST0 = 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; e.g. fcom qword [A3] - only FSW matters

        fnstsw  word [A1]               ; return the resulting FSW (no value result)

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3428
3429
3430
3431;
3432;---------------------- 80-bit floating point operations ----------------------
3433;
3434
;;
; Loads a 80-bit floating point register value from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW; the 80-bit load itself is lossless
        fld     tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so fstp cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3458
3459
;;
; Store a 80-bit floating point register to memory
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A3]              ; push the register value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW before the store
        fstp    tword [A2]              ; 80-bit store is lossless; pops ST0

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3483
3484
;;
; Loads an 80-bit floating point register value in BCD format from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW before the conversion
        fbld    tword [A2]              ; packed-BCD -> 80-bit float conversion

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so fstp cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3508
3509
;;
; Store a 80-bit floating point register to memory as BCD
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit BCD value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A3]              ; push the register value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW governs the BCD conversion/rounding
        fbstp   tword [A2]              ; 80-bit float -> packed BCD; pops ST0

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3533
3534
;;
; FPU instruction working on two 80-bit floating point values.
;
; @param 1 The instruction
; @param 2 The instruction operand list, e.g. {st0, st1}, or {} for
;          instructions with implicit operands (fprem, fprem1, fscale).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST0)
; @param A3 Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A3]              ; ST1 = second operand (loaded first)
        fld     tword [A2]              ; ST0 = first operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW before executing
        %1      %2                      ; e.g. fadd st0, st1 / fprem (no operands)

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture FSW before clearing exceptions
        fnclex                          ; clear pending exceptions so fstp cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3575
3576
;;
; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
; storing the result in ST1 and popping the stack.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST1).
; @param A3 Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A2]              ; will end up in ST1 after the next load
        fld     tword [A3]              ; ST0 = second argument
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; fpatan/fyl2x/fyl2xp1: result in ST1, stack popped -> result now in ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture FSW before clearing exceptions
        fnclex                          ; clear pending exceptions so fstp cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3612
3613
;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A3]              ; ST1 = second operand
        fld     tword [A2]              ; ST0 = first operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1                ; fcom/fucom: compare, condition codes go into FSW

        fnstsw  word [A1]               ; return the resulting FSW (no value result)

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3646
3647
;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param 1 The instruction
;
; @returns EFLAGS in EAX.
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A3]              ; ST1 = second operand
        fld     tword [A2]              ; ST0 = first operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1                     ; fcomi/fucomi st1: result goes into EFLAGS (ZF/PF/CF)

        fnstsw  word [A1]
        pushf                           ; grab the EFLAGS the comparison produced...
        pop     xAX                     ; ...and return them in (e/r)ax

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3683
3684
;;
; FPU instruction working on one 80-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A2]              ; ST0 = operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW before executing
        %1                              ; unary op on ST0 (fchs, fabs, fsqrt, ...)

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture FSW before clearing exceptions
        fnclex                          ; clear pending exceptions so fstp cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3721
3722
;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW.
;
; @param 1 The instruction
; @param 2 Non-zero to also restore FTW.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A2]              ; ST0 = operand
%if %2 != 0
        ; fxam inspects the tag word, so the guest FTW (reg 0 view) must be restored too.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1                              ; ftst/fxam: condition codes end up in FSW

        fnstsw  word [A1]               ; return the resulting FSW

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3758
3759
3760
;;
; FPU instruction loading a 80-bit floating point constant.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW affects rounding of the constant (fldl2t etc.)
        %1                              ; push the constant onto the FPU stack

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so fstp cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3795
3796
;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULTTWO for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state
        fld     tword [A2]              ; ST0 = operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; fptan/fxtract/fsincos: leaves two results on the stack

        fnstsw  word [A1 + IEMFPURESULTTWO.FSW] ; capture FSW before clearing exceptions
        fnclex                          ; clear exceptions before each popping store
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; top of stack = second result
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; next = first result

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3831
3832
3833
3834
3835;---------------------- SSE and MMX Operations ----------------------
3836
;; @todo what do we need to do for MMX?
; Currently empty placeholders; kept so every media helper brackets its body
; with a prologue/epilogue pair should state save/restore become necessary.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3854
3855
;;
; Media instruction working on two full sized registers.
;
; @param 1 The instruction
; @param 2 Whether there is an MMX variant (1) or not (0).
;
; @param A0 FPU context (fxsave).  (Not currently referenced by the generated
;           code; kept for interface uniformity with the other media helpers.)
; @param A1 Pointer to the first media register size operand (input/output).
; @param A2 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; destination operand
        movq    mm1, [A2]               ; source operand
        %1      mm0, mm1
        movq    [A1], mm0               ; write back result

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; unaligned loads - operand buffers may not be 16-byte aligned
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb, 1
IEMIMPL_MEDIA_F2 pand, 1
IEMIMPL_MEDIA_F2 pandn, 1
IEMIMPL_MEDIA_F2 por, 1
IEMIMPL_MEDIA_F2 pxor, 1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb, 1
IEMIMPL_MEDIA_F2 paddw, 1
IEMIMPL_MEDIA_F2 paddd, 1
IEMIMPL_MEDIA_F2 paddq, 1
IEMIMPL_MEDIA_F2 paddsb, 1
IEMIMPL_MEDIA_F2 paddsw, 1
IEMIMPL_MEDIA_F2 paddusb, 1
IEMIMPL_MEDIA_F2 paddusw, 1
IEMIMPL_MEDIA_F2 psubb, 1
IEMIMPL_MEDIA_F2 psubw, 1
IEMIMPL_MEDIA_F2 psubd, 1
IEMIMPL_MEDIA_F2 psubq, 1
IEMIMPL_MEDIA_F2 psubsb, 1
IEMIMPL_MEDIA_F2 psubsw, 1
IEMIMPL_MEDIA_F2 psubusb, 1
IEMIMPL_MEDIA_F2 psubusw, 1
IEMIMPL_MEDIA_F2 pmullw, 1
IEMIMPL_MEDIA_F2 pmulld, 0
IEMIMPL_MEDIA_F2 pmulhw, 1
IEMIMPL_MEDIA_F2 pmaddwd, 1
IEMIMPL_MEDIA_F2 pminub, 1
IEMIMPL_MEDIA_F2 pminuw, 0
IEMIMPL_MEDIA_F2 pminud, 0
IEMIMPL_MEDIA_F2 pminsb, 0
IEMIMPL_MEDIA_F2 pminsw, 1
IEMIMPL_MEDIA_F2 pminsd, 0
IEMIMPL_MEDIA_F2 pmaxub, 1
IEMIMPL_MEDIA_F2 pmaxuw, 0
IEMIMPL_MEDIA_F2 pmaxud, 0
IEMIMPL_MEDIA_F2 pmaxsb, 0
IEMIMPL_MEDIA_F2 pmaxsw, 1
IEMIMPL_MEDIA_F2 pmaxsd, 0
IEMIMPL_MEDIA_F2 pabsb, 1
IEMIMPL_MEDIA_F2 pabsw, 1
IEMIMPL_MEDIA_F2 pabsd, 1
IEMIMPL_MEDIA_F2 psignb, 1
IEMIMPL_MEDIA_F2 psignw, 1
IEMIMPL_MEDIA_F2 psignd, 1
IEMIMPL_MEDIA_F2 phaddw, 1
IEMIMPL_MEDIA_F2 phaddd, 1
IEMIMPL_MEDIA_F2 phsubw, 1
IEMIMPL_MEDIA_F2 phsubd, 1
IEMIMPL_MEDIA_F2 phaddsw, 1
IEMIMPL_MEDIA_F2 phsubsw, 1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw, 1
IEMIMPL_MEDIA_F2 pmuludq, 1
3956
3957
;;
; Media instruction working on two full sized registers, but no FXSAVE state argument.
;
; @param 1 The instruction
; @param 2 Whether there is an MMX variant (1) or not (0).
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; destination operand
        movq    mm1, [A1]               ; source operand
        %1      mm0, mm1
        movq    [A0], mm0               ; write back result

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; unaligned loads - buffers may not be 16-byte aligned
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
4029
;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 Pointer to the first full sized media register operand (input/output).
; @param A1 Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; destination operand
        movq    mm1, [A1]               ; source; punpckl* only reads the low half
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; unaligned loads - buffers may not be 16-byte aligned
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
4073
4074
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination register (full sized, output only).
; @param A1 Pointer to the first full sized media source register operand, where we
;           will only use the lower half as input - but we'll be loading it in full.
; @param A2 Pointer to the second full sized media source register operand, where we
;           will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source (low half used)
        vmovdqu xmm1, [A2]              ; second source (low half used)
        %1      xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy-paste error)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy-paste error)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
4119
4120
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 Pointer to the first full sized media register operand (input/output).
; @param A1 Pointer to the second full sized media register operand, where we
;           will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2               ; identical code shape; only which half the CPU reads differs
%endmacro

; Use the H1 wrapper for the high-half instructions so the intent is visible
; (previously these invoked IEMIMPL_MEDIA_F1L1 directly; the expansion is identical).
IEMIMPL_MEDIA_F1H1 punpckhbw, 1
IEMIMPL_MEDIA_F1H1 punpckhwd, 1
IEMIMPL_MEDIA_F1H1 punpckhdq, 1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
4139
4140
;;
; Media instruction working two half sized input registers (high half) and a full sized
; destination register (vpunpckh*).
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination register (full sized, output only).
; @param A1 Pointer to the first full sized media source register operand, where we
;           will only use the upper half as input - but we'll be loading it in full.
; @param A2 Pointer to the second full sized media source register operand, where we
;           will only use the upper half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1                 ; identical code shape; only which half the CPU reads differs
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4161
4162
4163;
4164; Shufflers with evil 8-bit immediates.
4165;
4166
;;
; pshufw with a runtime immediate: dispatches into a 256-entry table of
; 'pshufw mm0, mm1, imm8; ret' stubs, one per possible immediate value.
;
; @param A0 Where to store the result (u64).
; @param A1 Pointer to the source operand (u64).
; @param A2 The 8-bit immediate (only the low byte is used).
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm1, [A1]
        movq    mm0, mm0                ; paranoia!
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(pshufw+ret) == 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = &.imm0 + imm * stub-size
        IBT_NOTRACK
        call    T1                      ; execute the one stub matching the immediate
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; assert the stub-size assumption above
ENDPROC iemAImpl_pshufw_u64
4197
4198
;;
; SSE shuffles (pshufhw/pshuflw/pshufd) with a runtime immediate, implemented
; as a 256-entry table of 'op xmm0, xmm1, imm8; ret' stubs.
;
; @param 1 The instruction.
;
; @param A0 Where to store the result (u128).
; @param A1 Pointer to the source operand (u128).
; @param A2 The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; final *2 completes the stub-size multiply
        IBT_NOTRACK
        call    T1                      ; execute the stub matching the immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the stub-size assumption above
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4236
4237
;;
; AVX shuffles (vpshufhw/vpshuflw/vpshufd) with a runtime immediate, implemented
; as a 256-entry table of 'op ymm0, ymm1, imm8; ret' stubs.
;
; @param 1 The instruction.
;
; @param A0 Where to store the result (u256).
; @param A1 Pointer to the source operand (u256).
; @param A2 The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE - this is an AVX helper

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; final *2 completes the stub-size multiply
        IBT_NOTRACK
        call    T1                      ; execute the stub matching the immediate
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the stub-size assumption above
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4274
4275
4276;
4277; Shifts with evil 8-bit immediates.
4278;
4279
;;
; MMX shifts with a runtime immediate, implemented as a 256-entry table of
; 'psXX mm0, imm8; ret' stubs.
;
; @param 1 The instruction.
;
; @param A0 Pointer to the operand to shift (input/output, u64).
; @param A1 The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movq    mm0, [A0]
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*8]         ; sizeof(psXX+ret) == 9
 %else
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = &.imm0 + imm * stub-size
        IBT_NOTRACK
        call    T1                      ; execute the stub matching the immediate
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; assert the stub-size assumption above
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4320
4321
;;
; SSE shifts with a runtime immediate, implemented as a 256-entry table of
; 'psXX xmm0, imm8; ret' stubs.
;
; @param 1 The instruction.
;
; @param A0 Pointer to the operand to shift (input/output, u128).
; @param A1 The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
 %else
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; final *2 completes the stub-size multiply
        IBT_NOTRACK
        call    T1                      ; execute the stub matching the immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; assert the stub-size assumption above
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4364
4365
4366;
4367; Move byte mask.
4368;
4369
;;
; pmovmskb on a 64-bit (MMX) operand.
;
; @param A0 Pointer to a uint64_t for the resulting mask.
; @param A1 Pointer to the source operand (u64).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        pmovmskb T0, mm1                ; byte sign-bits -> low bits of T0
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper half of the u64 result
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4383
;;
; pmovmskb on a 128-bit (SSE) operand.
;
; @param A0 Pointer to a uint64_t for the resulting mask.
; @param A1 Pointer to the source operand (u128).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1               ; byte sign-bits -> low 16 bits of T0
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper half of the u64 result
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4397
;;
; vpmovmskb on a 256-bit (AVX2) operand.
;
; @param A0 Pointer to a uint64_t for the resulting mask.
; @param A1 Pointer to the source operand (u256).
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1              ; byte sign-bits -> low 32 bits of T0
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper half of the u64 result
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4411
4412
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; @param 1 The instruction
;
; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).  (Not
;           currently referenced by the generated code.)
; @param A1 Pointer to the destination media register size operand (output).
; @param A2 Pointer to the first source media register size operand (input).
; @param A3 Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]              ; first source
        vmovdqu xmm1, [A3]              ; second source
        %1      xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy-paste error)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy-paste error)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4484
4485
4486;;
4487; Media instruction working on two full sized source registers and one destination (AVX),
4488; but no XSAVE state pointer argument.
4489;
4490; @param 1 The instruction
4491;
4492; @param A0 Pointer to the destination media register size operand (output).
4493; @param A1 Pointer to the first source media register size operand (input).
4494; @param A2 Pointer to the second source media register size operand (input).
4495;
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source operand
        vmovdqu xmm1, [A2]              ; second source operand
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; store the result

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; first source operand
        vmovdqu ymm1, [A2]              ; second source operand
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0              ; store the result

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
IEMIMPL_MEDIA_OPT_F3 vpsubsb
IEMIMPL_MEDIA_OPT_F3 vpsubsw
IEMIMPL_MEDIA_OPT_F3 vpsubusb
IEMIMPL_MEDIA_OPT_F3 vpsubusw
IEMIMPL_MEDIA_OPT_F3 vpaddusb
IEMIMPL_MEDIA_OPT_F3 vpaddusw
IEMIMPL_MEDIA_OPT_F3 vpaddsb
IEMIMPL_MEDIA_OPT_F3 vpaddsw
IEMIMPL_MEDIA_OPT_F3 vpermilps
IEMIMPL_MEDIA_OPT_F3 vpermilpd
IEMIMPL_MEDIA_OPT_F3 vpmaddwd
IEMIMPL_MEDIA_OPT_F3 vpsrlvd
IEMIMPL_MEDIA_OPT_F3 vpsrlvq
IEMIMPL_MEDIA_OPT_F3 vpsravd
IEMIMPL_MEDIA_OPT_F3 vpsllvd
IEMIMPL_MEDIA_OPT_F3 vpsllvq
4568
4569;;
4570; Media instruction working on one full sized source register, one full sized destination
4571; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
4572; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
4573; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
4574; of either 16, 32, or 64, it acts like the max shift size)
4575;
4576; @param 1 The instruction
4577;
4578; @param A0 Pointer to the destination media register size operand (output).
4579; @param A1 Pointer to the first source media register size operand (input).
4580; @param A2 Pointer to the second source media register size operand (input).
4581;
%macro IEMIMPL_SHIFT_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; value to shift
        vmovdqu xmm1, [A2]              ; 128-bit operand holding the 64-bit shift count
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; store the result

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; value to shift
        vmovdqu xmm1, [A2]              ; the shift count operand is always XMM sized
        %1      ymm0, ymm0, xmm1
        vmovdqu [A0], ymm0              ; store the result

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_SHIFT_OPT_F3 vpsllw
IEMIMPL_SHIFT_OPT_F3 vpslld
IEMIMPL_SHIFT_OPT_F3 vpsllq
IEMIMPL_SHIFT_OPT_F3 vpsraw
IEMIMPL_SHIFT_OPT_F3 vpsrad
IEMIMPL_SHIFT_OPT_F3 vpsrlw
IEMIMPL_SHIFT_OPT_F3 vpsrld
IEMIMPL_SHIFT_OPT_F3 vpsrlq
4618
4619
4620;;
4621; Media instruction working on one full sized source registers and one destination (AVX),
4622; but no XSAVE state pointer argument.
4623;
4624; @param 1 The instruction
; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
4626;
4627; @param A0 Pointer to the destination media register size operand (output).
4628; @param A1 Pointer to the source media register size operand (input).
4629;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; source operand
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0              ; store the result

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; source operand
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0              ; store the result

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0  ; SSE4.1 only instruction, no 256-bit form.
4662
4663
4664;
4665; The SSE 4.2 crc32
4666;
; @param A0 Pointer to the 32-bit destination.
; @param A1 The source operand, sized according to the suffix.
4669;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; Load the current CRC accumulator.
        crc32   T0_32, A1_8             ; Fold in the 8-bit source operand.
        mov     [A0], T0_32             ; Write back the updated CRC.

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4679
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; Load the current CRC accumulator.
        crc32   T0_32, A1_16            ; Fold in the 16-bit source operand.
        mov     [A0], T0_32             ; Write back the updated CRC.

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4689
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; Load the current CRC accumulator.
        crc32   T0_32, A1_32            ; Fold in the 32-bit source operand.
        mov     [A0], T0_32             ; Write back the updated CRC.

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4699
%ifdef RT_ARCH_AMD64 ; The 64-bit source form of CRC32 only exists in long mode.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; Load the current CRC accumulator (upper 32 bits implicitly zeroed).
        crc32   T0, A1                  ; Fold in the 64-bit source operand; result fits in 32 bits.
        mov     [A0], T0_32             ; Write back the updated CRC.

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4711
4712
4713;
4714; PTEST (SSE 4.1)
4715;
4716; @param A0 Pointer to the first source operand (aka readonly destination).
4717; @param A1 Pointer to the second source operand.
4718; @param A2 Pointer to the EFLAGS register.
4719;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; first source operand
        movdqu  xmm1, [A1]              ; second source operand
        ptest   xmm0, xmm1
        ; PTEST only defines ZF and CF; the other status flags are cleared.
        IEM_SAVE_FLAGS A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4732
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE            ; NOTE(review): SSE prologue used for an AVX insn - confirm this is intentional.

        vmovdqu ymm0, [A0]              ; first source operand
        vmovdqu ymm1, [A1]              ; second source operand
        vptest  ymm0, ymm1
        ; VPTEST only defines ZF and CF; the other status flags are cleared.
        IEM_SAVE_FLAGS A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4745
4746
4747;;
4748; Template for the [v]pmov{s,z}x* instructions
4749;
4750; @param 1 The instruction
4751;
4752; @param A0 Pointer to the destination media register size operand (output).
4753; @param A1 The source operand value (input).
4754;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1                ; the source value is passed directly in A1
        %1      xmm0, xmm0
        movdqu  [A0], xmm0              ; fix: was vmovdqu - VEX encoding would #UD on an SSE4.1-only host

        IEMIMPL_SSE_EPILOGUE            ; fix: was IEMIMPL_SSE_PROLOGUE (copy & paste typo)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1                ; the source value is passed directly in A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]              ; the 256-bit form widens a 128-bit memory/register source
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4806
4807
4808;;
4809; Need to move this as well somewhere better?
4810;
;; Result package for SSE helpers: the 128-bit value plus the resulting MXCSR.
struc IEMSSERESULT
    .uResult resd 4                     ; The 128-bit result value.
    .MXCSR resd 1                       ; The guest MXCSR merged with the status flags of the operation.
endstruc
4815
4816
4817;;
4818; Need to move this as well somewhere better?
4819;
;; Result package for 128-bit AVX helpers: the 128-bit value plus the resulting MXCSR.
struc IEMAVX128RESULT
    .uResult resd 4                     ; The 128-bit result value.
    .MXCSR resd 1                       ; The MXCSR value after the operation.
endstruc
4824
4825
4826;;
4827; Need to move this as well somewhere better?
4828;
;; Result package for 256-bit AVX helpers: the 256-bit value plus the resulting MXCSR.
struc IEMAVX256RESULT
    .uResult resd 8                     ; The 256-bit result value.
    .MXCSR resd 1                       ; The MXCSR value after the operation.
endstruc
4833
4834
4835;;
4836; Initialize the SSE MXCSR register using the guest value partially to
4837; account for rounding mode.
4838;
4839; @uses 4 bytes of stack to save the original value, T0.
4840; @param 1 Expression giving the address of the FXSTATE of the guest.
4841;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; Save the host MXCSR; left on the stack for SSE_ST_FXSTATE_MXCSR to restore.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; Take only the guest's rounding/denormal controls.
        or      T0_32, X86_MXCSR_XCPT_MASK ; Mask all exceptions; status flags are harvested afterwards instead.
        sub     xSP, 4                  ; Scratch slot for loading the merged value.
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4                  ; Drop the scratch slot; the saved host MXCSR stays on the stack.
%endmacro
4854
4855
4856;;
4857; Restores the SSE MXCSR register with the original value.
4858;
4859; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4860; @param 1 Expression giving the address where to return the MXCSR value.
4861; @param 2 Expression giving the address of the FXSTATE of the guest.
4862;
4863; @note Restores the stack pointer.
4864;
%macro SSE_ST_FXSTATE_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; Read the post-operation MXCSR (contains the new status flags).
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; Keep only the exception status flags from the host run.
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ldmxcsr [xSP]                   ; Restore the host MXCSR saved by SSE_LD_FXSTATE_MXCSR.
        add     xSP, 4                  ; Balances the first 'sub xSP, 4' in SSE_LD_FXSTATE_MXCSR.
%endmacro
4879
4880
4881;;
4882; Initialize the SSE MXCSR register using the guest value partially to
4883; account for rounding mode.
4884;
4885; @uses 4 bytes of stack to save the original value.
4886; @param 1 Expression giving the address of the FXSTATE of the guest.
4887;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; Save the host MXCSR; left on the stack for AVX*_ST_XSAVEAREA_MXCSR.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; Take only the guest's rounding/denormal controls.
        ; NOTE(review): unlike SSE_LD_FXSTATE_MXCSR this does not OR in X86_MXCSR_XCPT_MASK - confirm that is intentional.
        sub     xSP, 4                  ; Scratch slot for loading the merged value.
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4                  ; Drop the scratch slot; the saved host MXCSR stays on the stack.
%endmacro
4899
4900
4901;;
4902; Restores the AVX128 MXCSR register with the original value.
4903;
4904; @param 1 Expression giving the address where to return the MXCSR value.
4905;
4906; @note Restores the stack pointer.
4907;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR] ; Store the post-operation MXCSR straight into the result.

        ldmxcsr [xSP]                   ; Restore the host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR.
        add     xSP, 4                  ; Balances the first 'sub xSP, 4' in AVX_LD_XSAVEAREA_MXCSR.
%endmacro
4914
4915
4916;;
4917; Restores the AVX256 MXCSR register with the original value.
4918;
4919; @param 1 Expression giving the address where to return the MXCSR value.
4920;
4921; @note Restores the stack pointer.
4922;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR] ; Store the post-operation MXCSR straight into the result.

        ldmxcsr [xSP]                   ; Restore the host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR.
        add     xSP, 4                  ; Balances the first 'sub xSP, 4' in AVX_LD_XSAVEAREA_MXCSR.
%endmacro
4929
4930
4931;;
4932; Floating point instruction working on two full sized registers.
4933;
4934; @param 1 The instruction
4935; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4936;
4937; @param A0 FPU context (FXSTATE or XSAVEAREA).
4938; @param A1 Where to return the result including the MXCSR value.
4939; @param A2 Pointer to the first media register size operand (input/output).
4940; @param A3 Pointer to the second media register size operand (input).
4941;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]              ; first source operand (also the destination value)
        movdqu  xmm1, [A3]              ; second source operand
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; fix: was IEMIMPL_SSE_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]              ; first source operand
        vmovdqu xmm1, [A3]              ; second source operand
        v %+ %1 xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]              ; first source operand
        vmovdqu ymm1, [A3]              ; second source operand
        v %+ %1 ymm0, ymm0, ymm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]              ; first source operand (loaded for parity with the 3-op variant)
        vmovdqu xmm1, [A3]              ; second source operand
        v %+ %1 xmm0, xmm1              ; two-operand AVX form (unary instructions)
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]              ; first source operand (loaded for parity with the 3-op variant)
        vmovdqu ymm1, [A3]              ; second source operand
        v %+ %1 ymm0, ymm1              ; two-operand AVX form (unary instructions)
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 rcpps, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
5056
5057
5058;;
5059; Floating point instruction working on a full sized register and a single precision operand.
5060;
5061; @param 1 The instruction
5062;
5063; @param A0 FPU context (FXSTATE or XSAVEAREA).
5064; @param A1 Where to return the result including the MXCSR value.
5065; @param A2 Pointer to the first media register size operand (input/output).
5066; @param A3 Pointer to the second single precision floating point value (input).
5067;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]              ; first source operand (also the destination value)
        movd    xmm1, [A3]              ; 32-bit scalar source
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]              ; first source operand
        vmovd   xmm1, [A3]              ; 32-bit scalar source
        v %+ %1 xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
5110
5111
5112;;
5113; Floating point instruction working on a full sized register and a double precision operand.
5114;
5115; @param 1 The instruction
5116;
5117; @param A0 FPU context (FXSTATE or XSAVEAREA).
5118; @param A1 Where to return the result including the MXCSR value.
5119; @param A2 Pointer to the first media register size operand (input/output).
5120; @param A3 Pointer to the second double precision floating point value (input).
5121;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]              ; first source operand (also the destination value)
        movq    xmm1, [A3]              ; 64-bit scalar source
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]              ; first source operand
        vmovq   xmm1, [A3]              ; 64-bit scalar source
        v %+ %1 xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
5162
5163
5164;;
5165; Macro for the cvtpd2ps/cvtps2pd instructions.
5166;
5167; 1 The instruction name.
5168; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
5169;
5170; @param A0 FPU context (FXSTATE or XSAVEAREA).
5171; @param A1 Where to return the result including the MXCSR value.
5172; @param A2 Pointer to the first media register size operand (input/output).
5173; @param A3 Pointer to the second media register size operand (input).
5174;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]              ; first source operand (also the destination value)
        movdqu  xmm1, [A3]              ; second source operand
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1              ; two-operand AVX conversion form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; narrowing conversion: 256-bit source -> 128-bit result (e.g. vcvtpd2ps)
 %else
        v %+ %1 ymm0, xmm1              ; widening conversion: 128-bit source -> 256-bit result (e.g. vcvtps2pd)
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
5228
5229
5230;;
5231; shufps instructions with 8-bit immediates.
5232;
5233; @param A0 Pointer to the destination media register size operand (input/output).
5234; @param A1 Pointer to the first source media register size operand (input).
5235; @param A2 The 8-bit immediate
5236;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ; Dispatch into a table of 256 pre-assembled 'shufps xmm0, xmm1, imm8' stubs,
        ; one per immediate value, since the immediate cannot be a runtime operand.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps  xmm0, xmm1, bImm
        ret
        int3                            ; Pads each stub to the fixed stride assumed above.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_shufps_u128
5268
5269
5270;;
5271; shufpd instruction with 8-bit immediates.
5272;
5273; @param A0 Pointer to the destination media register size operand (input/output).
5274; @param A1 Pointer to the first source media register size operand (input).
5275; @param A2 The 8-bit immediate
5276;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ; Dispatch into a table of 256 pre-assembled 'shufpd xmm0, xmm1, imm8' stubs,
        ; one per immediate value, since the immediate cannot be a runtime operand.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd  xmm0, xmm1, bImm        ; shufpd is one byte longer than shufps, so no int3 padding needed.
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_shufpd_u128
5307
5308
5309;;
5310; vshufp{s,d} instructions with 8-bit immediates.
5311;
5312; @param 1 The instruction name.
5313;
5314; @param A0 Pointer to the destination media register size operand (output).
5315; @param A1 Pointer to the first source media register size operand (input).
5316; @param A2 Pointer to the second source media register size operand (input).
5317; @param A3 The 8-bit immediate
5318;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        ; Dispatch into a table of 256 pre-assembled stubs, one per immediate value.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        ; Dispatch into a table of 256 pre-assembled stubs, one per immediate value.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5385
5386
5387;;
5388; One of the [p]blendv{b,ps,pd} variants
5389;
5390; @param 1 The instruction
5391;
5392; @param A0 Pointer to the first media register sized operand (input/output).
5393; @param A1 Pointer to the second media sized value (input).
5394; @param A2 Pointer to the media register sized mask value (input).
5395;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; This is implicit: the blend mask must live in xmm0.
        movdqu  xmm1, [A0]              ; destination operand
        movdqu  xmm2, [A1]              ; source operand ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; fix: was IEMIMPL_SSE_PROLOGUE (copy & paste typo)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5415
5416
5417;;
5418; One of the v[p]blendv{b,ps,pd} variants
5419;
5420; @param 1 The instruction
5421;
5422; @param A0 Pointer to the first media register sized operand (output).
5423; @param A1 Pointer to the first media register sized operand (input).
5424; @param A2 Pointer to the second media register sized operand (input).
5425; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source operand
        vmovdqu xmm1, [A2]              ; second source operand
        vmovdqu xmm2, [A3]              ; blend mask
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0              ; store the result

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; first source operand
        vmovdqu ymm1, [A2]              ; second source operand
        vmovdqu ymm2, [A3]              ; blend mask
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0              ; store the result

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5459
5460
5461;;
5462; palignr mm1, mm2/m64 instruction.
5463;
5464; @param A0 Pointer to the first media register sized operand (output).
5465; @param A1 The second register sized operand (input).
5466; @param A2 The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm0, [A0]               ; destination operand
        movq    mm1, A1                 ; the second operand is passed by value, not by pointer
        ; Dispatch into a table of 256 pre-assembled stubs, one per immediate value.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_palignr_u64
5497
5498
5499;;
5500; SSE instructions with 8-bit immediates of the form
5501; xxx xmm1, xmm2, imm8.
5502; where the instruction encoding takes up 6 bytes.
5503;
5504; @param 1 The instruction name.
5505;
5506; @param A0 Pointer to the first media register size operand (input/output).
5507; @param A1 Pointer to the second source media register size operand (input).
5508; @param A2 The 8-bit immediate
5509;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ; Dispatch into a table of 256 pre-assembled stubs, one per immediate value.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
 %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; Pads each stub to the fixed stride assumed above.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5551
5552
5553;;
5554; AVX instructions with 8-bit immediates of the form
5555; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5556; where the instruction encoding takes up 6 bytes.
5557;
5558; @param 1 The instruction name.
5559; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5560; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5561;
5562; @param A0 Pointer to the destination media register size operand (output).
5563; @param A1 Pointer to the first source media register size operand (input).
5564; @param A2 Pointer to the second source media register size operand (input).
5565; @param A3 The 8-bit immediate
5566;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        ; Dispatch into a table of 256 pre-assembled stubs, one per immediate value.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3                            ; Pads each stub to the fixed stride assumed above.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        ; Dispatch into a table of 256 pre-assembled stubs, one per immediate value.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3                            ; Pads each stub to the fixed stride assumed above.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
5646
5647
5648;;
5649; AVX instructions with 8-bit immediates of the form
5650; xxx {x,y}mm1, {x,y}mm2, imm8.
5651; where the instruction encoding takes up 6 bytes.
5652;
5653; @param 1 The instruction name.
5654; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5655; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5656;
5657; @param A0 Pointer to the destination media register size operand (output).
5658; @param A1 Pointer to the first source media register size operand (input).
5659; @param A2 The 8-bit immediate
5660;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 3
 ; Two-operand sibling of IEMIMPL_MEDIA_AVX_INSN_IMM8_6: same 256-entry stub
 ; array technique, but the instruction takes a single source register (A2 is
 ; the immediate instead of a second source pointer).
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (stub index must be 0..255)
        movdqu  xmm1, [A1]              ; source operand
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A2
        movdqu  [A0], xmm0              ; store the result

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; pad each stub to the fixed size assumed above
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (stub index must be 0..255)
        vmovdqu ymm1, [A1]              ; source operand
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A2
        vmovdqu [A0], ymm0              ; store the result

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
        int3                            ; pad each stub to the fixed size assumed above
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
 %endif
%endmacro

; Instantiations: name, has-128-bit-variant, has-256-bit-variant.
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 vpermilps, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 vpermilpd, 1, 1
5731
5732
5733;;
5734; Need to move this as well somewhere better?
5735;
; Source-operand block for pcmpistri/pcmpistrm: two 128-bit values.
struc IEMPCMPISTRXSRC
    .uSrc1      resd 4                  ; first 128-bit source operand
    .uSrc2      resd 4                  ; second 128-bit source operand
endstruc

; Source-operand block for pcmpestri/pcmpestrm: two 128-bit values plus the
; explicit RAX/RDX length registers these instructions consume.
struc IEMPCMPESTRXSRC
    .uSrc1      resd 4                  ; first 128-bit source operand
    .uSrc2      resd 4                  ; second 128-bit source operand
    .u64Rax     resd 2                  ; guest RAX (length of first operand)
    .u64Rdx     resd 2                  ; guest RDX (length of second operand)
endstruc
5747
5748;;
5749; The pcmpistri instruction.
5750;
5751; @param A0 Pointer to the ECX register to store the result to (output).
5752; @param A1 Pointer to the EFLAGS register.
5753; @param A2 Pointer to the structure containing the source operands (input).
5754; @param A3 The 8-bit immediate
5755;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits (stub index must be 0..255)
        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A3

        IEM_SAVE_FLAGS A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov     [T2], ecx               ; pcmpistri delivers its index result in ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistri xmm0, xmm1, bImm
        ret                             ; pcmpistri (6 bytes) + ret + int3 pad = 8 bytes/stub
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpistri_u128
5790
5791;;
5792; The pcmpestri instruction.
5793;
5794; @param A0 Pointer to the ECX register to store the result to (output).
5795; @param A1 Pointer to the EFLAGS register.
5796; @param A2 Pointer to the structure containing the source operands (input).
5797; @param A3 The 8-bit immediate
5798;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits (stub index must be 0..255)
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A3

        pop     xDX                     ; restore the clobbered argument register
        IEM_SAVE_FLAGS A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov     [T2], ecx               ; pcmpestri delivers its index result in ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret                             ; REX.W + pcmpestri (7 bytes) + ret = 8 bytes/stub, no pad needed
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestri_u128
5837
5838;;
5839; The pcmpistrm instruction template.
5840;
5841; @param A0 Pointer to the XMM0 register to store the result to (output).
5842; @param A1 Pointer to the EFLAGS register.
5843; @param A2 Pointer to the structure containing the source operands (input).
5844; @param A3 The 8-bit immediate
5845;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits (stub index must be 0..255)
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(pcmpistrm+ret) == 8: A3 * 8
                                        ; Bugfix: this previously loaded T0 (lea T0, [T1 + A3*8]) while
                                        ; still doing 'call T1', so every immediate executed the imm=0 stub.
 %endif
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A3

        IEM_SAVE_FLAGS A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu  [A0], xmm0              ; pcmpistrm delivers its mask result in xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistrm xmm1, xmm2, bImm
        ret                             ; pcmpistrm (6 bytes) + ret + int3 pad = 8 bytes/stub
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpistrm_u128
5879
5880;;
5881; The pcmpestrm instruction template.
5882;
5883; @param A0 Pointer to the XMM0 register to store the result to (output).
5884; @param A1 Pointer to the EFLAGS register.
5885; @param A2 Pointer to the structure containing the source operands (input).
5886; @param A3 The 8-bit immediate
5887;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits (stub index must be 0..255)
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A3

        pop     xDX                     ; restore the clobbered argument register
        IEM_SAVE_FLAGS A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu  [A0], xmm0              ; pcmpestrm delivers its mask result in xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret                             ; REX.W + pcmpestrm (7 bytes) + ret = 8 bytes/stub, no pad needed
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestrm_u128
5925
5926
5927;;
5928; pinsrw instruction.
5929;
5930; @param A0 Pointer to the first media register size operand (input/output).
5931; @param A1 The 16 bit input operand (input).
5932; @param A2 The 8-bit immediate
5933;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (stub index must be 0..255)
        movq    mm0, [A0]               ; load the destination (read-modify-write)
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pinsrw+ret) == 5: A2 * 5
 %endif
        lea     T1, [T1 + T0]
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A2
        movq    [A0], mm0               ; store the result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  mm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_pinsrw_u64

BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (stub index must be 0..255)
        movdqu  xmm0, [A0]              ; load the destination (read-modify-write)
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A2
        movdqu  [A0], xmm0              ; store the result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_pinsrw_u128
5993
5994;;
5995; vpinsrw instruction.
5996;
5997; @param A0 Pointer to the first media register size operand (output).
5998; @param A1 Pointer to the source media register size operand (input).
5999; @param A2 The 16 bit input operand (input).
6000; @param A3 The 8-bit immediate
6001;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits (stub index must be 0..255)
        movdqu  xmm0, [A1]              ; source register operand
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        mov     A1, A2                  ; A2 requires longer encoding on Windows (r8d would need a 3-byte VEX prefix)
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A3
        movdqu  [A0], xmm0              ; store the result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_vpinsrw_u128
6032
6033
6034;;
6035; movmskp{s,d} SSE instruction template
6036;
6037; @param 1 The SSE instruction name.
6038; @param 2 The AVX instruction name.
6039;
6040; @param A0 Pointer to the output register (output/byte sized).
6041; @param A1 Pointer to the source media register size operand (input).
6042;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
; SSE variant: %1 xmm -> general register, low byte stored to *A0.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0                ; sign-bit mask into T0
        mov     byte [A0], T0_8

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; AVX 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0                ; sign-bit mask into T0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

; AVX 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0                ; sign-bit mask into T0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

; Instantiations: SSE name, AVX name.
IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6083
6084
6085;;
6086; Restores the SSE MXCSR register with the original value.
6087;
6088; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
6089; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6090; @param 2 Expression giving the address of the FXSTATE of the guest.
6091;
6092; @note Restores the stack pointer.
6093;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; fetch the current (post-instruction) MXCSR
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the new exception status flags
        or      T0_32, T1_32
        mov     [%1], T0_32             ; return merged MXCSR to the caller

        ; Pop and reload the host MXCSR left on the stack by the matching
        ; SSE_LD_FXSTATE_MXCSR invocation (presumably - it is defined elsewhere
        ; in this file); hence the '@note Restores the stack pointer'.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
6108
6109
6110;;
6111; cvttsd2si instruction - 32-bit variant.
6112;
6113; @param A0 FPU context (FXSTATE or XSAVEAREA).
6114; @param A1 Where to return the MXCSR value.
6115; @param A2 Pointer to the result operand (output).
6116; @param A3 Pointer to the second operand (input).
6117;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvttsd2si T0_32, [A3]           ; truncating double -> i32
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64

;;
; cvttsd2si instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvttsd2si T0, [A3]              ; truncating double -> i64
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64


;;
; cvtsd2si instruction - 32-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvtsd2si T0_32, [A3]            ; rounding double -> i32 (uses MXCSR.RC)
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64

;;
; cvtsd2si instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvtsd2si T0, [A3]               ; rounding double -> i64 (uses MXCSR.RC)
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
6194
6195
6196;;
6197; cvttss2si instruction - 32-bit variant.
6198;
6199; @param A0 FPU context (FXSTATE or XSAVEAREA).
6200; @param A1 Where to return the MXCSR value.
6201; @param A2 Pointer to the result operand (output).
6202; @param A3 Pointer to the second operand (input).
6203;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvttss2si T0_32, [A3]           ; truncating float -> i32
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32

;;
; cvttss2si instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvttss2si T0, [A3]              ; truncating float -> i64
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32


;;
; cvtss2si instruction - 32-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvtss2si T0_32, [A3]            ; rounding float -> i32 (uses MXCSR.RC)
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32

;;
; cvtss2si instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvtss2si T0, [A3]               ; rounding float -> i64 (uses MXCSR.RC)
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
6280
6281
6282;;
6283; cvtsi2ss instruction - 32-bit variant.
6284;
6285; @param A0 FPU context (FXSTATE or XSAVEAREA).
6286; @param A1 Where to return the MXCSR value.
6287; @param A2 Pointer to the result operand (output).
6288; @param A3 Pointer to the second operand (input).
6289;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvtsi2ss xmm0, dword [A3]       ; i32 -> float (uses MXCSR.RC)
        movd    dword [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32

;;
; cvtsi2ss instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvtsi2ss xmm0, qword [A3]       ; i64 -> float (uses MXCSR.RC)
        movd    dword [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64


;;
; cvtsi2sd instruction - 32-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvtsi2sd xmm0, dword [A3]       ; i32 -> double (exact, but keep MXCSR handling uniform)
        movq    [A2], xmm0
L
        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32

;;
; cvtsi2sd instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; run under the guest's rounding/masking config

        cvtsi2sd xmm0, qword [A3]       ; i64 -> double (uses MXCSR.RC)
        movq    [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; merge status flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
6366
6367
6368;;
6369; Initialize the SSE MXCSR register using the guest value partially to
6370; account for rounding mode.
6371;
6372; @uses 4 bytes of stack to save the original value, T0.
6373; @param 1 Expression giving the address of the MXCSR register of the guest.
6374;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; stash the host MXCSR on the stack; the 4 bytes stay
                                        ; allocated until SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE pops them
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only rounding/FTZ/DAZ from the guest
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions so the host never faults
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the merged control value
        add     xSP, 4
%endmacro
6387
6388
6389;;
6390; Restores the SSE MXCSR register with the original value.
6391;
6392; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
6393; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6394;
6395; @note Restores the stack pointer.
6396;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        sub     xSP, 4
        stmxcsr [xSP]                   ; fetch the current (post-instruction) MXCSR
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%1]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the new exception status flags
        or      T0_32, T1_32
        mov     [%1], T0_32             ; return merged MXCSR to the caller

        ; Pop and reload the host MXCSR that SSE_LD_FXSTATE_MXCSR_ONLY left on
        ; the stack; hence the '@note Restores the stack pointer'.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
6411
6412
6413;
6414; UCOMISS (SSE)
6415;
6416; @param A0 Pointer to the MXCSR value (input/output).
6417; @param A1 Pointer to the EFLAGS value (input/output).
6418; @param A2 Pointer to the first source operand (aka readonly destination).
6419; @param A3 Pointer to the second source operand.
6420;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomiss xmm0, xmm1              ; unordered compare of the low floats, sets ZF/PF/CF
        IEM_SAVE_FLAGS A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128

; AVX form; NOTE(review): uses the SSE prologue/epilogue like its SSE sibling - confirm intended.
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomiss xmm0, xmm1             ; unordered compare of the low floats, sets ZF/PF/CF
        IEM_SAVE_FLAGS A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128


;
; UCOMISD (SSE)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomisd xmm0, xmm1              ; unordered compare of the low doubles, sets ZF/PF/CF
        IEM_SAVE_FLAGS A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128

; AVX form; NOTE(review): uses the SSE prologue/epilogue like its SSE sibling - confirm intended.
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomisd xmm0, xmm1             ; unordered compare of the low doubles, sets ZF/PF/CF
        IEM_SAVE_FLAGS A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6489
6490;
6491; COMISS (SSE)
6492;
6493; @param A0 Pointer to the MXCSR value (input/output).
6494; @param A1 Pointer to the EFLAGS value (input/output).
6495; @param A2 Pointer to the first source operand (aka readonly destination).
6496; @param A3 Pointer to the second source operand.
6497;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comiss  xmm0, xmm1              ; ordered compare of the low floats, sets ZF/PF/CF
        IEM_SAVE_FLAGS A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128

; AVX form; NOTE(review): uses the SSE prologue/epilogue like its SSE sibling - confirm intended.
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomiss xmm0, xmm1              ; ordered compare of the low floats, sets ZF/PF/CF
        IEM_SAVE_FLAGS A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128


;
; COMISD (SSE)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comisd  xmm0, xmm1              ; ordered compare of the low doubles, sets ZF/PF/CF
        IEM_SAVE_FLAGS A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128

; AVX form; NOTE(review): uses the SSE prologue/epilogue like its SSE sibling - confirm intended.
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomisd xmm0, xmm1              ; ordered compare of the low doubles, sets ZF/PF/CF
        IEM_SAVE_FLAGS A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6566
6567
6568;;
6569; Need to move this as well somewhere better?
6570;
; Two 128-bit source operands passed as one block (used by cmpps & friends).
struc IEMMEDIAF2XMMSRC
    .uSrc1      resd 4                  ; first 128-bit source operand
    .uSrc2      resd 4                  ; second 128-bit source operand
endstruc
6575
6576
6577;
6578; CMPPS (SSE)
6579;
6580; @param A0 Pointer to the MXCSR value (input/output).
6581; @param A1 Pointer to the first media register size operand (output).
6582; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6583; @param A3 The 8-bit immediate (input).
6584;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run under the guest's rounding/masking config

        movzx   A3, A3_8                ; must clear top bits (stub index must be 0..255)
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*8]         ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
 %else
        lea     T0, [A3 + A3*4]         ; sizeof(cmpps+ret) == 5: A3 * 5
 %endif
        lea     T1, [T1 + T0]
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A3
        movdqu  [A1], xmm0              ; store the comparison mask result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        cmpps   xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_cmpps_u128
6617
6618;;
6619; SSE instructions with 8-bit immediates of the form
6620; xxx xmm1, xmm2, imm8.
6621; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
6622; register.
6623;
6624; @param 1 The instruction name.
6625;
6626; @param A0 Pointer to the MXCSR value (input/output).
6627; @param A1 Pointer to the first media register size operand (output).
6628; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6629; @param A3 The 8-bit immediate (input).
6630;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
; Like iemAImpl_cmpps_u128, but for 6-byte-encoded compares (cmppd/cmpss/cmpsd):
; 256-entry stub array, guest MXCSR loaded around the instruction.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run under the guest's rounding/masking config

        movzx   A3, A3_8                ; must clear top bits (stub index must be 0..255)
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A3
        movdqu  [A1], xmm0              ; store the comparison result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6669
6670;;
6671; SSE instructions with 8-bit immediates of the form
6672; xxx xmm1, xmm2, imm8.
6673; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6674; register.
6675;
6676; @param 1 The instruction name.
6677;
6678; @param A0 Pointer to the MXCSR value (input/output).
6679; @param A1 Pointer to the first media register size operand (output).
6680; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6681; @param A3 The 8-bit immediate (input).
6682;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
; Variant of IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 for 6-byte instruction
; encodings; the stubs get an int3 pad to reach the fixed 8-byte stride.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run under the guest's rounding/masking config

        movzx   A3, A3_8                ; must clear top bits (stub index must be 0..255)
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insn+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; execute the stub for immediate A3
        movdqu  [A1], xmm0              ; store the result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; pad each stub to the fixed size assumed above
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6725
6726
6727;;
6728; SSE instructions of the form
6729; xxx mm, xmm.
6730; and we need to load and save the MXCSR register.
6731;
6732; @param 1 The instruction name.
6733;
6734; @param A0 Pointer to the MXCSR value (input/output).
6735; @param A1 Pointer to the first MMX register sized operand (output).
6736; @param A2 Pointer to the media register sized operand (input).
6737;
6738%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
6739BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6740 PROLOGUE_3_ARGS
6741 IEMIMPL_SSE_PROLOGUE
6742 SSE_LD_FXSTATE_MXCSR_ONLY A0
6743
6744 movdqu xmm0, [A2]
6745 %1 mm0, xmm0
6746 movq [A1], mm0
6747
6748 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6749 IEMIMPL_SSE_EPILOGUE
6750 EPILOGUE_3_ARGS
6751ENDPROC iemAImpl_ %+ %1 %+ _u128
6752%endmacro
6753
6754IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
6755IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6756
6757;;
6758; SSE instructions of the form
6759; xxx xmm, xmm/m64.
6760; and we need to load and save the MXCSR register.
6761;
6762; @param 1 The instruction name.
6763;
6764; @param A0 Pointer to the MXCSR value (input/output).
6765; @param A1 Pointer to the first media register sized operand (input/output).
6766; @param A2 The 64bit source value from a MMX media register (input)
6767;
6768%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
6769BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6770 PROLOGUE_3_ARGS
6771 IEMIMPL_SSE_PROLOGUE
6772 SSE_LD_FXSTATE_MXCSR_ONLY A0
6773
6774 movdqu xmm0, [A1]
6775 movq mm0, A2
6776 %1 xmm0, mm0
6777 movdqu [A1], xmm0
6778
6779 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6780 IEMIMPL_SSE_EPILOGUE
6781 EPILOGUE_3_ARGS
6782ENDPROC iemAImpl_ %+ %1 %+ _u128
6783%endmacro
6784
6785IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
6786IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6787
6788;;
6789; SSE instructions of the form
6790; xxx mm, xmm/m64.
6791; and we need to load and save the MXCSR register.
6792;
6793; @param 1 The instruction name.
6794;
6795; @param A0 Pointer to the MXCSR value (input/output).
6796; @param A1 Pointer to the first MMX media register sized operand (output).
6797; @param A2 The 64bit source value (input).
6798;
6799%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
6800BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6801 PROLOGUE_3_ARGS
6802 IEMIMPL_SSE_PROLOGUE
6803 SSE_LD_FXSTATE_MXCSR_ONLY A0
6804
6805 movq xmm0, A2
6806 %1 mm0, xmm0
6807 movq [A1], mm0
6808
6809 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6810 IEMIMPL_SSE_EPILOGUE
6811 EPILOGUE_3_ARGS
6812ENDPROC iemAImpl_ %+ %1 %+ _u128
6813%endmacro
6814
6815IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
6816IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6817
6818;
6819; All forms of RDRAND and RDSEED
6820;
6821; @param A0 Pointer to the destination operand.
6822; @param A1 Pointer to the EFLAGS value (input/output).
6823;
6824%macro IEMIMPL_RDRAND_RDSEED 3
6825BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
6826 PROLOGUE_2_ARGS
6827
6828 %1 %2
6829 mov [A0], %2
6830 IEM_SAVE_FLAGS A1, X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF
6831
6832 EPILOGUE_2_ARGS
6833ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
6834%endmacro
6835
6836IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
6837IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
6838IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
6839IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
6840IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
6841IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6842
6843
6844;;
6845; sha1rnds4 xmm1, xmm2, imm8.
6846;
6847; @param 1 The instruction name.
6848;
6849; @param A0 Pointer to the first media register size operand (input/output).
6850; @param A1 Pointer to the second source media register size operand (input).
6851; @param A2 The 8-bit immediate
6852;
6853BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
6854 PROLOGUE_3_ARGS
6855 IEMIMPL_SSE_PROLOGUE
6856
6857 movzx A2, A2_8 ; must clear top bits
6858 movdqu xmm0, [A0]
6859 movdqu xmm1, [A1]
6860 lea T1, [.imm0 xWrtRIP]
6861 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6862 lea T0, [A2 + A2*4] ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
6863 %else
6864 lea T0, [A2 + A2*2] ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
6865 %endif
6866 lea T1, [T1 + T0*2]
6867 IBT_NOTRACK
6868 call T1
6869 movdqu [A0], xmm0
6870
6871 IEMIMPL_SSE_EPILOGUE
6872 EPILOGUE_3_ARGS
6873 %assign bImm 0
6874 %rep 256
6875.imm %+ bImm:
6876 IBT_ENDBRxx_WITHOUT_NOTRACK
6877 sha1rnds4 xmm0, xmm1, bImm
6878 ret
6879 %assign bImm bImm + 1
6880 %endrep
6881.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6882ENDPROC iemAImpl_sha1rnds4_u128
6883
6884
6885;;
6886; sha256rnds2 xmm1, xmm2, <XMM0>.
6887;
6888; @param 1 The instruction name.
6889;
6890; @param A0 Pointer to the first media register size operand (input/output).
6891; @param A1 Pointer to the second source media register size operand (input).
6892; @param A2 Pointer to the implicit XMM0 constants (input).
6893;
6894BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
6895 PROLOGUE_3_ARGS
6896 IEMIMPL_SSE_PROLOGUE
6897
6898 movdqu xmm0, [A2]
6899 movdqu xmm1, [A0]
6900 movdqu xmm2, [A1]
6901 sha256rnds2 xmm1, xmm2
6902 movdqu [A0], xmm1
6903
6904 IEMIMPL_SSE_EPILOGUE
6905 EPILOGUE_3_ARGS
6906ENDPROC iemAImpl_sha256rnds2_u128
6907
6908
6909;
6910; 32-bit forms of ADCX and ADOX
6911;
6912; @param A0 Pointer to the destination operand (input/output).
6913; @param A1 32-bit source operand 1 (input).
6914; @param A2 Pointer to the EFLAGS value (input/output).
6915;
6916%macro IEMIMPL_ADX_32 2
6917BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
6918 PROLOGUE_4_ARGS
6919
6920 IEM_LOAD_FLAGS A2, %2, 0
6921 %1 A1_32, [A0]
6922 mov [A0], A1_32
6923 IEM_SAVE_FLAGS A2, %2, 0, 0
6924
6925 EPILOGUE_4_ARGS
6926ENDPROC iemAImpl_ %+ %1 %+ _u32
6927%endmacro
6928
6929;
6930; 64-bit forms of ADCX and ADOX
6931;
6932; @param A0 Pointer to the destination operand (input/output).
6933; @param A1 64-bit source operand 1 (input).
6934; @param A2 Pointer to the EFLAGS value (input/output).
6935;
6936%macro IEMIMPL_ADX_64 2
6937BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
6938 PROLOGUE_4_ARGS
6939
6940 IEM_LOAD_FLAGS A2, %2, 0
6941 %1 A1, [A0]
6942 mov [A0], A1
6943 IEM_SAVE_FLAGS A2, %2, 0, 0
6944
6945 EPILOGUE_4_ARGS
6946ENDPROC iemAImpl_ %+ %1 %+ _u64
6947%endmacro
6948
6949IEMIMPL_ADX_32 adcx, X86_EFL_CF
6950IEMIMPL_ADX_64 adcx, X86_EFL_CF
6951
6952IEMIMPL_ADX_32 adox, X86_EFL_OF
6953IEMIMPL_ADX_64 adox, X86_EFL_OF
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette