VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 100764

Last change on this file since 100764 was 100709, checked in by vboxsync, 17 months ago

VMM: Added missing splitlock handling for cmpxchg8b and cmpxchg16b, bugref:10052

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 190.5 KB
Line 
1; $Id: IEMAllAImpl.asm 100709 2023-07-26 13:33:34Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; RET XX / RET wrapper for fastcall.
43;
44%macro RET_FASTCALL 1
45%ifdef RT_ARCH_X86
46 %ifdef RT_OS_WINDOWS
47 ret %1 ; 32-bit Windows: RET with the byte count given as the macro argument.
48 %else
49 ret ; other 32-bit hosts: plain RET, the macro argument is ignored.
50 %endif
51%else
52 ret ; not a 32-bit x86 build: plain RET, the macro argument is ignored.
53%endif
54%endmacro
55
56;;
57; NAME for fastcall functions.
58;
59;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
60; escaping (or whatever the dollar is good for here). Thus the ugly
61; prefix argument.
62;
63%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
64%ifdef RT_ARCH_X86 ; the default above is plain NAME(a_Name); a_cbArgs and a_Prefix are unused there.
65 %ifdef RT_OS_WINDOWS ; only x86 + Windows switches to the decorated form below.
66 %undef NAME_FASTCALL
67 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
68 %endif
69%endif
70
71;;
72; BEGINPROC for fastcall functions.
73;
74; @param 1 The function name (C).
75; @param 2 The argument size on x86.
76;
77%macro BEGINPROC_FASTCALL 2
78 %ifdef ASM_FORMAT_PE
79 export %1=NAME_FASTCALL(%1,%2,$@) ; PE output: export the C name as an alias of the fastcall symbol.
80 %endif
81 %ifdef __NASM__
82 %ifdef ASM_FORMAT_OMF
83 export NAME(%1) NAME_FASTCALL(%1,%2,$@) ; only when assembling with NASM for OMF output.
84 %endif
85 %endif
86 %ifndef ASM_FORMAT_BIN
87 global NAME_FASTCALL(%1,%2,$@) ; declared global for every output format except flat binary.
88 %endif
89NAME_FASTCALL(%1,%2,@): ; the label itself uses a plain '@' prefix, export/global use the dollar-escaped one.
90 IBT_ENDBRxx ; defined outside this file chunk; presumably the IBT landing pad - TODO confirm.
91%endmacro
92
93
94;
95; We employ some macro assembly here to hide the calling convention differences.
96;
97%ifdef RT_ARCH_AMD64
98 %macro PROLOGUE_1_ARGS 0
99 %endmacro ; AMD64: the prologue macros are empty, nothing is saved or loaded.
100 %macro EPILOGUE_1_ARGS 0
101 ret
102 %endmacro
103 %macro EPILOGUE_1_ARGS_EX 0-1
104 ret ; the optional byte count is accepted like in the other _EX macros and the x86 variant, and ignored here.
105 %endmacro
106
107 %macro PROLOGUE_2_ARGS 0
108 %endmacro
109 %macro EPILOGUE_2_ARGS 0
110 ret
111 %endmacro
112 %macro EPILOGUE_2_ARGS_EX 1
113 ret ; the byte count argument is ignored on AMD64.
114 %endmacro
115
116 %macro PROLOGUE_3_ARGS 0
117 %endmacro
118 %macro EPILOGUE_3_ARGS 0
119 ret
120 %endmacro
121 %macro EPILOGUE_3_ARGS_EX 1
122 ret ; the byte count argument is ignored on AMD64.
123 %endmacro
124
125 %macro PROLOGUE_4_ARGS 0
126 %endmacro
127 %macro EPILOGUE_4_ARGS 0
128 ret
129 %endmacro
130 %macro EPILOGUE_4_ARGS_EX 1
131 ret ; the byte count argument is ignored on AMD64.
132 %endmacro
133
134 %ifdef ASM_CALL64_GCC ; argument aliases for this convention: A0..A3 = rdi, rsi, rdx, rcx.
135 %define A0 rdi
136 %define A0_32 edi
137 %define A0_16 di
138 %define A0_8 dil
139
140 %define A1 rsi
141 %define A1_32 esi
142 %define A1_16 si
143 %define A1_8 sil
144
145 %define A2 rdx
146 %define A2_32 edx
147 %define A2_16 dx
148 %define A2_8 dl
149
150 %define A3 rcx
151 %define A3_32 ecx
152 %define A3_16 cx
153 %endif ; note: no A3_8 alias is defined.
154
155 %ifdef ASM_CALL64_MSC ; argument aliases for this convention: A0..A3 = rcx, rdx, r8, r9.
156 %define A0 rcx
157 %define A0_32 ecx
158 %define A0_16 cx
159 %define A0_8 cl
160
161 %define A1 rdx
162 %define A1_32 edx
163 %define A1_16 dx
164 %define A1_8 dl
165
166 %define A2 r8
167 %define A2_32 r8d
168 %define A2_16 r8w
169 %define A2_8 r8b
170
171 %define A3 r9
172 %define A3_32 r9d
173 %define A3_16 r9w
174 %endif ; note: no A3_8 alias is defined.
175
176 %define T0 rax ; temporaries T0..T2 are the same registers for both conventions.
177 %define T0_32 eax
178 %define T0_16 ax
179 %define T0_8 al
180
181 %define T1 r11
182 %define T1_32 r11d
183 %define T1_16 r11w
184 %define T1_8 r11b
185
186 %define T2 r10 ; only AMD64
187 %define T2_32 r10d
188 %define T2_16 r10w
189 %define T2_8 r10b
190
191%else
192 ; x86
193 %macro PROLOGUE_1_ARGS 0
194 push edi ; edi is T1 on x86, so it is saved before use.
195 %endmacro
196 %macro EPILOGUE_1_ARGS 0
197 pop edi
198 ret 0
199 %endmacro
200 %macro EPILOGUE_1_ARGS_EX 1
201 pop edi
202 ret %1 ; pops the given number of argument bytes on return.
203 %endmacro
204
205 %macro PROLOGUE_2_ARGS 0
206 push edi ; edi is T1 on x86, so it is saved before use.
207 %endmacro
208 %macro EPILOGUE_2_ARGS 0
209 pop edi
210 ret 0
211 %endmacro
212 %macro EPILOGUE_2_ARGS_EX 1
213 pop edi
214 ret %1 ; pops the given number of argument bytes on return.
215 %endmacro
216
217 %macro PROLOGUE_3_ARGS 0
218 push ebx ; ebx becomes A2, save the caller's value first.
219 mov ebx, [esp + 4 + 4] ; skip the saved ebx and the return address: first stack argument -> A2.
220 push edi ; edi is T1.
221 %endmacro
222 %macro EPILOGUE_3_ARGS_EX 1
223 %if (%1) < 4
224 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
225 %endif
226 pop edi ; restore in reverse order of PROLOGUE_3_ARGS.
227 pop ebx
228 ret %1
229 %endmacro
230 %macro EPILOGUE_3_ARGS 0
231 EPILOGUE_3_ARGS_EX 4 ; default: one 4-byte stack argument to pop.
232 %endmacro
233
234 %macro PROLOGUE_4_ARGS 0
235 push ebx ; ebx becomes A2.
236 push edi ; edi is T1.
237 push esi ; esi becomes A3.
238 mov ebx, [esp + 12 + 4 + 0] ; skip three saved registers and the return address: first stack argument -> A2.
239 mov esi, [esp + 12 + 4 + 4] ; second stack argument -> A3.
240 %endmacro
241 %macro EPILOGUE_4_ARGS_EX 1
242 %if (%1) < 8
243 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
244 %endif
245 pop esi ; restore in reverse order of PROLOGUE_4_ARGS.
246 pop edi
247 pop ebx
248 ret %1
249 %endmacro
250 %macro EPILOGUE_4_ARGS 0
251 EPILOGUE_4_ARGS_EX 8 ; default: two 4-byte stack arguments to pop.
252 %endmacro
253
254 %define A0 ecx ; x86: A0 and A1 are ecx and edx; A2 and A3 are loaded from the stack by the PROLOGUE macros.
255 %define A0_32 ecx
256 %define A0_16 cx
257 %define A0_8 cl
258
259 %define A1 edx
260 %define A1_32 edx
261 %define A1_16 dx
262 %define A1_8 dl
263
264 %define A2 ebx
265 %define A2_32 ebx
266 %define A2_16 bx
267 %define A2_8 bl
268
269 %define A3 esi
270 %define A3_32 esi
271 %define A3_16 si
272
273 %define T0 eax
274 %define T0_32 eax
275 %define T0_16 ax
276 %define T0_8 al
277
278 %define T1 edi ; note: there is no T1_8 alias on x86 (and no T2 at all).
279 %define T1_32 edi
280 %define T1_16 di
281%endif
282
283
284;;
285; Load the modified (%2) and undefined (%3) flags from [%1] into the host EFLAGS. The '%if (%3) != 0' check in the body is commented out, so the load is unconditional.
286;
287; @remarks Clobbers T0, stack. Changes EFLAGS.
288; @note The flags pointer is whichever register is passed as parameter 1; it is not fixed to A2.
289; @param 1 The parameter (A0..A3) pointing to the eflags.
290; @param 2 The set of modified flags.
291; @param 3 The set of undefined flags.
292;
293%macro IEM_MAYBE_LOAD_FLAGS 3
294 ;%if (%3) != 0 -- check disabled: the sequence below is always assembled.
295 pushf ; store current (host) flags on the stack.
296 mov T0_32, [%1] ; load the guest flags.
297 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags in the stacked host copy.
298 and T0_32, (%2 | %3) ; keep only the modified and undefined flags of the guest value.
299 or [xSP], T0 ; merge guest flags with host flags (full-width T0, only the bits selected above are set).
300 popf ; load the mixed flags into EFLAGS.
301 ;%endif -- see the disabled check above.
302%endmacro
303
304;;
305; Load the relevant flags from [%1].
306;
307; @remarks Clobbers T0, stack. Changes EFLAGS.
308; @note The flags pointer is whichever register is passed as parameter 1; it is not fixed to A2.
309; @param 1 The parameter (A0..A3) pointing to the eflags.
310; @param 2 The set of flags to load.
311; @param 3 The set of undefined flags.
312;
313%macro IEM_LOAD_FLAGS 3
314 pushf ; store current (host) flags on the stack.
315 mov T0_32, [%1] ; load the guest flags.
316 and dword [xSP], ~(%2 | %3) ; mask out the flags to load and the undefined flags in the stacked host copy.
317 and T0_32, (%2 | %3) ; keep only those flags of the guest value.
318 or [xSP], T0 ; merge guest flags with host flags (full-width T0, only the bits selected above are set).
319 popf ; load the mixed flags into EFLAGS.
320%endmacro
321
322;;
323; Update the flag.
324;
325; @remarks Clobbers T0, T1, stack.
326; @param 1 The register pointing to the EFLAGS.
327; @param 2 The mask of modified flags to save.
328; @param 3 The mask of undefined flags to (maybe) save.
329;
330%macro IEM_SAVE_FLAGS 3
331 %if (%2 | %3) != 0 ; nothing is assembled when both masks are zero.
332 pushf
333 pop T1 ; T1 = host EFLAGS as left by the preceding instruction.
334 mov T0_32, [%1] ; flags
335 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
336 and T1_32, (%2 | %3) ; select the modified and undefined flags.
337 or T0_32, T1_32 ; combine the flags.
338 mov [%1], T0_32 ; save the flags.
339 %endif
340%endmacro
341
342;;
343; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
344;
345; @remarks Clobbers T0, T1, stack.
346; @param 1 The register pointing to the EFLAGS.
347; @param 2 The mask of modified flags to save.
348; @param 3 Mask of additional flags to always clear
349; @param 4 Mask of additional flags to always set.
350;
351%macro IEM_SAVE_AND_ADJUST_FLAGS 4
352 %if (%2 | %3 | %4) != 0 ; nothing is assembled when all three masks are zero.
353 pushf
354 pop T1 ; T1 = host EFLAGS as left by the preceding instruction.
355 mov T0_32, [%1] ; load flags.
356 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
357 and T1_32, (%2) ; select the modified flags.
358 or T0_32, T1_32 ; combine the flags.
359 %if (%4) != 0
360 or T0_32, %4 ; add the always set flags.
361 %endif
362 mov [%1], T0_32 ; save the result.
363 %endif
364%endmacro
365
366;;
367; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
368; signed input (%4[%5]) and parity index (%6).
369;
370; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
371; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
372; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
373;
374; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
375; @param 1 The register pointing to the EFLAGS.
376; @param 2 The mask of modified flags to save.
377; @param 3 Mask of additional flags to always clear
378; @param 4 The result register to set SF by.
379; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
380; @param 6 The (full) register containing the parity table index. Will be modified!
381
382%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
383 %ifdef RT_ARCH_AMD64
384 pushf
385 pop T2 ; AMD64: host EFLAGS go to T2, so T0/xAX stays untouched.
386 %else
387 push T0 ; x86: no T2, so park T0/xAX on the stack while it holds the host EFLAGS.
388 pushf
389 pop T0
390 %endif
391 mov T1_32, [%1] ; load flags.
392 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
393 %ifdef RT_ARCH_AMD64
394 and T2_32, (%2) ; select the modified flags.
395 or T1_32, T2_32 ; combine the flags.
396 %else
397 and T0_32, (%2) ; select the modified flags.
398 or T1_32, T0_32 ; combine the flags.
399 pop T0 ; restore the result register parked above.
400 %endif
401
402 ; First calculate SF as it's likely to be referring to the same register as %6 does.
403 bt %4, %5 - 1
404 jnc %%sf_clear
405 or T1_32, X86_EFL_SF
406 %%sf_clear:
407
408 ; Parity last.
409 and %6, 0xff
410 %ifdef RT_ARCH_AMD64
411 lea T2, [NAME(g_afParity) xWrtRIP]
412 or T1_8, [T2 + %6]
413 %else
414 movzx %6, byte [NAME(g_afParity) + %6] ; x86 has no T1_8 (T1 is edi), so fetch the table byte via the index register,
 or T1_32, %6 ; which may be modified, and merge it with a 32-bit OR.
415 %endif
416
417 mov [%1], T1_32 ; save the result.
418%endmacro
419
420;;
421; Calculates the new EFLAGS using fixed clear and set bit masks.
422;
423; @remarks Clobbers T0.
424; @param 1 The register pointing to the EFLAGS.
425; @param 2 Mask of additional flags to always clear
426; @param 3 Mask of additional flags to always set.
427;
428%macro IEM_ADJUST_FLAGS 3
429 %if (%2 | %3) != 0 ; nothing is assembled when both masks are zero.
430 mov T0_32, [%1] ; Load flags.
431 %if (%2) != 0
432 and T0_32, ~(%2) ; Remove the always cleared flags.
433 %endif
434 %if (%3) != 0
435 or T0_32, %3 ; Add the always set flags.
436 %endif
437 mov [%1], T0_32 ; Save the result. The host EFLAGS are not read by this macro.
438 %endif
439%endmacro
440
441;;
442; Calculates the new EFLAGS using fixed clear and set bit masks.
443;
444; @remarks Clobbers T0, %4, EFLAGS.
445; @param 1 The register pointing to the EFLAGS.
446; @param 2 Mask of additional flags to always clear
447; @param 3 Mask of additional flags to always set.
448; @param 4 The (full) register containing the parity table index. Will be modified!
449;
450%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
451 mov T0_32, [%1] ; Load flags.
452 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
453 %if (%3) != 0
454 or T0_32, %3 ; Add the always set flags.
455 %endif
456 and %4, 0xff ; Reduce the index register to its low byte, the table index.
457 %ifdef RT_ARCH_AMD64
458 lea T2, [NAME(g_afParity) xWrtRIP] ; AMD64: table base goes into T2, which is therefore clobbered as well.
459 or T0_8, [T2 + %4] ; Merge the table byte for this index into the low flags byte.
460 %else
461 or T0_8, [NAME(g_afParity) + %4] ; x86: the table is addressed directly.
462 %endif
463 mov [%1], T0_32 ; Save the result.
464%endmacro
465
466
467;;
468; Checks that the size expression %1 matches %2 adjusted according to
469; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
470; @param 1 The jump array size assembly expression.
471; @param 2 The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
472;
473%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
474 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK ; expected size grows by 256*4 here; presumably 4 bytes of ENDBR per entry - TODO confirm.
475 dw (0xffff - %2 - 256*4) + %1 ; will cause warning if entries are too big.
476 dw (0xffff + %2 + 256*4) - %1 ; will cause warning if entries are too small.
477 %else
478 dw (0xffff - %2) + %1 ; will cause warning if entries are too big.
479 dw (0xffff + %2) - %1 ; will cause warning if entries are too small.
480 %endif ; either way each word evaluates to exactly 0xffff only when the two sizes match.
481%endmacro
482
483
484;*********************************************************************************************************************************
485;* External Symbols *
486;*********************************************************************************************************************************
487extern NAME(g_afParity)
488
489
490;;
491; Macro for implementing a binary operator.
492;
493; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
494; variants, except on 32-bit system where the 64-bit accesses requires hand
495; coding.
496;
497; All the functions takes a pointer to the destination memory operand in A0,
498; the source register operand in A1 and a pointer to eflags in A2.
499;
500; @param 1 The instruction mnemonic.
501; @param 2 Non-zero if there should be a locked version.
502; @param 3 The modified flags.
503; @param 4 The undefined flags.
504;
505%macro IEMIMPL_BIN_OP 4
506BEGINCODE
507BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
508 PROLOGUE_3_ARGS
509 IEM_MAYBE_LOAD_FLAGS A2, %3, %4 ; seed the host EFLAGS from the guest flags at [A2] (e.g. the carry for adc/sbb).
510 %1 byte [A0], A1_8 ; the host instruction operates directly on the destination memory.
511 IEM_SAVE_FLAGS A2, %3, %4 ; copy the resulting modified/undefined flags back to [A2].
512 EPILOGUE_3_ARGS
513ENDPROC iemAImpl_ %+ %1 %+ _u8
514
515BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
516 PROLOGUE_3_ARGS
517 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
518 %1 word [A0], A1_16
519 IEM_SAVE_FLAGS A2, %3, %4
520 EPILOGUE_3_ARGS
521ENDPROC iemAImpl_ %+ %1 %+ _u16
522
523BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
524 PROLOGUE_3_ARGS
525 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
526 %1 dword [A0], A1_32
527 IEM_SAVE_FLAGS A2, %3, %4
528 EPILOGUE_3_ARGS
529ENDPROC iemAImpl_ %+ %1 %+ _u32
530
531 %ifdef RT_ARCH_AMD64
532BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
533 PROLOGUE_3_ARGS
534 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
535 %1 qword [A0], A1
536 IEM_SAVE_FLAGS A2, %3, %4
537 EPILOGUE_3_ARGS_EX 8 ; only assembled for AMD64, where the byte count is ignored.
538ENDPROC iemAImpl_ %+ %1 %+ _u64
539 %endif ; RT_ARCH_AMD64
540
541 %if %2 != 0 ; locked versions requested?
542
543BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
544 PROLOGUE_3_ARGS
545 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
546 lock %1 byte [A0], A1_8 ; same as the plain variant, but with a LOCK prefix.
547 IEM_SAVE_FLAGS A2, %3, %4
548 EPILOGUE_3_ARGS
549ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
550
551BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
552 PROLOGUE_3_ARGS
553 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
554 lock %1 word [A0], A1_16
555 IEM_SAVE_FLAGS A2, %3, %4
556 EPILOGUE_3_ARGS
557ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
558
559BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
560 PROLOGUE_3_ARGS
561 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
562 lock %1 dword [A0], A1_32
563 IEM_SAVE_FLAGS A2, %3, %4
564 EPILOGUE_3_ARGS
565ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
566
567 %ifdef RT_ARCH_AMD64
568BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
569 PROLOGUE_3_ARGS
570 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
571 lock %1 qword [A0], A1
572 IEM_SAVE_FLAGS A2, %3, %4
573 EPILOGUE_3_ARGS_EX 8
574ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
575 %endif ; RT_ARCH_AMD64
576 %endif ; locked
577%endmacro
578
579; instr,lock, modified-flags, undefined flags (cmp and test are instantiated without locked variants)
580IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
581IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
582IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
583IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
584IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
585IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
586IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
587IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
588IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
589
590
591;;
592; Macro for implementing a binary operator, VEX variant with separate input/output.
593;
594; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
595; where the 64-bit accesses requires hand coding.
596;
597; All the functions takes a pointer to the destination memory operand in A0,
598; the first source register operand in A1, the second source register operand
599; in A2 and a pointer to eflags in A3.
600;
601; @param 1 The instruction mnemonic.
602; @param 2 The modified flags.
603; @param 3 The undefined flags.
604;
605%macro IEMIMPL_VEX_BIN_OP 3
606BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
607 PROLOGUE_4_ARGS
608 IEM_MAYBE_LOAD_FLAGS A3, %2, %3 ; seed the host EFLAGS from the guest flags at [A3].
609 %1 T0_32, A1_32, A2_32 ; three-operand form: the result lands in T0, both sources stay intact.
610 mov [A0], T0_32 ; store the result through the destination pointer (mov leaves EFLAGS alone).
611 IEM_SAVE_FLAGS A3, %2, %3 ; copy the resulting modified/undefined flags back to [A3].
612 EPILOGUE_4_ARGS
613ENDPROC iemAImpl_ %+ %1 %+ _u32
614
615 %ifdef RT_ARCH_AMD64
616BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
617 PROLOGUE_4_ARGS
618 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
619 %1 T0, A1, A2
620 mov [A0], T0
621 IEM_SAVE_FLAGS A3, %2, %3
622 EPILOGUE_4_ARGS
623ENDPROC iemAImpl_ %+ %1 %+ _u64
624 %endif ; RT_ARCH_AMD64
625%endmacro
626
627; instr, modified-flags, undefined-flags
628IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
629IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
630IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
631
632;;
633; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
634;
635; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
636; where the 64-bit accesses requires hand coding.
637;
638; All the functions takes a pointer to the destination memory operand in A0,
639; the source register operand in A1 and a pointer to eflags in A2.
640;
641; @param 1 The instruction mnemonic.
642; @param 2 The modified flags.
643; @param 3 The undefined flags.
644;
645%macro IEMIMPL_VEX_BIN_OP_2 3
646BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
647 PROLOGUE_3_ARGS ; three arguments (12 bytes): destination pointer, source, eflags pointer.
648 IEM_MAYBE_LOAD_FLAGS A2, %2, %3 ; seed the host EFLAGS from the guest flags at [A2].
649 mov T0_32, [A0]
650 %1 T0_32, A1_32
651 mov [A0], T0_32 ; store the result through the destination pointer (mov leaves EFLAGS alone).
652 IEM_SAVE_FLAGS A2, %2, %3 ; copy the resulting modified/undefined flags back to [A2].
653 EPILOGUE_3_ARGS ; must match the 3-argument prologue so 32-bit x86 pops 4 bytes, not 8.
654ENDPROC iemAImpl_ %+ %1 %+ _u32
655
656 %ifdef RT_ARCH_AMD64
657BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
658 PROLOGUE_3_ARGS
659 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
660 mov T0, [A0]
661 %1 T0, A1
662 mov [A0], T0
663 IEM_SAVE_FLAGS A2, %2, %3
664 EPILOGUE_3_ARGS
665ENDPROC iemAImpl_ %+ %1 %+ _u64
666 %endif ; RT_ARCH_AMD64
667%endmacro
668
669; instr, modified-flags, undefined-flags
670IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
671IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
672IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
673
674
675;;
676; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
677;
678; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
679; where the 64-bit accesses requires hand coding.
680;
681; All the functions take a pointer to the destination memory operand in A0,
682; the first source register operand in A1 and the second source register
683; operand in A2. No eflags pointer is passed; EFLAGS are neither loaded nor saved.
684;
685; @param 1 The instruction mnemonic.
686; @param 2 Fallback instruction if applicable.
687; @param 3 Whether to emit fallback or not.
688;
689%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
690BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
691 PROLOGUE_3_ARGS
692 %1 T0_32, A1_32, A2_32 ; native three-operand instruction, result in T0.
693 mov [A0], T0_32
694 EPILOGUE_3_ARGS
695ENDPROC iemAImpl_ %+ %1 %+ _u32
696
697 %if %3
698BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
699 PROLOGUE_3_ARGS
700 %ifdef ASM_CALL64_GCC
701 mov cl, A2_8 ; the fallback instruction takes its count in CL; A0 is not xCX here, so just copy it.
702 %2 A1_32, cl
703 mov [A0], A1_32
704 %else
705 xchg A2, A0 ; A0 is xCX here: swap so CL holds the count and A2 holds the destination pointer.
706 %2 A1_32, cl
707 mov [A2], A1_32
708 %endif
709 EPILOGUE_3_ARGS
710ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
711 %endif
712
713 %ifdef RT_ARCH_AMD64
714BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
715 PROLOGUE_3_ARGS
716 %1 T0, A1, A2
717 mov [A0], T0
718 EPILOGUE_3_ARGS
719ENDPROC iemAImpl_ %+ %1 %+ _u64
720
721 %if %3
722BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
723 PROLOGUE_3_ARGS
724 %ifdef ASM_CALL64_GCC
725 mov cl, A2_8
726 %2 A1, cl
727 mov [A0], A1 ; store all 64 bits of the result.
728 %else
729 xchg A2, A0 ; after the swap A0 holds the count and A2 the destination pointer,
730 %2 A1, cl
731 mov [A2], A1 ; so the full 64-bit result must be stored via A2 and never via A0.
732 %endif
734 EPILOGUE_3_ARGS
735ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
736 %endif
737 %endif ; RT_ARCH_AMD64
738%endmacro
739
740; instr, fallback instr, emit fallback
741IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
742IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
743IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
744IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
745IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
746
747
748;
749; RORX uses a immediate byte for the shift count, so we only do
750; fallback implementation of that one.
751;
752BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
753 PROLOGUE_3_ARGS
754 %ifdef ASM_CALL64_GCC
755 mov cl, A2_8 ; ROR takes its count in CL; A0 is not xCX here, so just copy it.
756 ror A1_32, cl
757 mov [A0], A1_32 ; store the rotated value through the destination pointer.
758 %else
759 xchg A2, A0 ; A0 is xCX here: swap so CL holds the count and A2 holds the destination pointer.
760 ror A1_32, cl
761 mov [A2], A1_32 ; the destination pointer now lives in A2.
762 %endif
763 EPILOGUE_3_ARGS
764ENDPROC iemAImpl_rorx_u32
765
766 %ifdef RT_ARCH_AMD64
767BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
768 PROLOGUE_3_ARGS
769 %ifdef ASM_CALL64_GCC
770 mov cl, A2_8 ; ROR takes its count in CL; A0 is not rcx here, so just copy it.
771 ror A1, cl
772 mov [A0], A1 ; store the rotated 64-bit value through the destination pointer.
773 %else
774 xchg A2, A0 ; A0 is rcx here: swap so CL holds the count and A2 holds the destination pointer.
775 ror A1, cl
776 mov [A2], A1 ; the destination pointer now lives in A2.
777 %endif
778 EPILOGUE_3_ARGS
779ENDPROC iemAImpl_rorx_u64
780 %endif ; RT_ARCH_AMD64
781
782
783;
784; MULX
785;
786BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
787 PROLOGUE_4_ARGS
788%ifdef ASM_CALL64_GCC
789 ; A2_32 is EDX - perfect, MULX takes its implicit source operand from there.
790 mulx T0_32, T1_32, A3_32 ; T0 = high half, T1 = low half of EDX * A3.
791 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
792 mov [A0], T0_32
793%else
794 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
795 xchg A1, A2 ; from here on A1 (xDX) holds uSrc1 and A2 holds the low-half destination pointer.
796 mulx T0_32, T1_32, A3_32
797 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
798 mov [A0], T0_32
799%endif
800 EPILOGUE_4_ARGS
801ENDPROC iemAImpl_mulx_u32
802
803
804BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
805 PROLOGUE_4_ARGS
806%ifdef ASM_CALL64_GCC
807 ; A2_32 is EDX, T0_32 is EAX
808 mov eax, A3_32
809 mul A2_32 ; EDX:EAX = uSrc2 * uSrc1.
810 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
811 mov [A0], edx
812%else
813 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
814 xchg A1, A2 ; from here on A1 (xDX) holds uSrc1 and A2 holds the low-half destination pointer.
815 mov eax, A3_32
816 mul A1_32 ; EDX:EAX = uSrc2 * uSrc1; the multiplier is in A1 after the swap, A2 is a pointer now.
817 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
818 mov [A0], edx
819%endif
820 EPILOGUE_4_ARGS
821ENDPROC iemAImpl_mulx_u32_fallback
822
823%ifdef RT_ARCH_AMD64
824BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
825 PROLOGUE_4_ARGS
826%ifdef ASM_CALL64_GCC
827 ; A2 is RDX - perfect, MULX takes its implicit source operand from there.
828 mulx T0, T1, A3 ; T0 = high half, T1 = low half of RDX * A3.
829 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
830 mov [A0], T0
831%else
832 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
833 xchg A1, A2 ; from here on A1 (RDX) holds uSrc1 and A2 holds the low-half destination pointer.
834 mulx T0, T1, A3
835 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
836 mov [A0], T0
837%endif
838 EPILOGUE_4_ARGS
839ENDPROC iemAImpl_mulx_u64
840
841
842BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
843 PROLOGUE_4_ARGS
844%ifdef ASM_CALL64_GCC
845 ; A2 is RDX, T0 is RAX
846 mov rax, A3
847 mul A2 ; RDX:RAX = uSrc2 * uSrc1.
848 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
849 mov [A0], rdx
850%else
851 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
852 xchg A1, A2 ; from here on A1 (RDX) holds uSrc1 and A2 holds the low-half destination pointer.
853 mov rax, A3
854 mul A1 ; RDX:RAX = uSrc2 * uSrc1; the multiplier is in A1 after the swap, A2 is a pointer now.
855 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
856 mov [A0], rdx
857%endif
858 EPILOGUE_4_ARGS
859ENDPROC iemAImpl_mulx_u64_fallback
860
861%endif
862
863
864;;
865; Macro for implementing a bit operator.
866;
867; This will generate code for the 16, 32 and 64 bit accesses with locked
868; variants, except on 32-bit system where the 64-bit accesses requires hand
869; coding.
870;
871; All the functions takes a pointer to the destination memory operand in A0,
872; the source register operand in A1 and a pointer to eflags in A2.
873;
874; @param 1 The instruction mnemonic.
875; @param 2 Non-zero if there should be a locked version.
876; @param 3 The modified flags.
877; @param 4 The undefined flags.
878;
879%macro IEMIMPL_BIT_OP 4
880BEGINCODE
881BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
882 PROLOGUE_3_ARGS
883 IEM_MAYBE_LOAD_FLAGS A2, %3, %4 ; seed the host EFLAGS from the guest flags at [A2].
884 %1 word [A0], A1_16 ; the host instruction operates directly on the destination memory.
885 IEM_SAVE_FLAGS A2, %3, %4 ; copy the resulting modified/undefined flags back to [A2].
886 EPILOGUE_3_ARGS
887ENDPROC iemAImpl_ %+ %1 %+ _u16
888
889BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
890 PROLOGUE_3_ARGS
891 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
892 %1 dword [A0], A1_32
893 IEM_SAVE_FLAGS A2, %3, %4
894 EPILOGUE_3_ARGS
895ENDPROC iemAImpl_ %+ %1 %+ _u32
896
897 %ifdef RT_ARCH_AMD64
898BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
899 PROLOGUE_3_ARGS
900 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
901 %1 qword [A0], A1
902 IEM_SAVE_FLAGS A2, %3, %4
903 EPILOGUE_3_ARGS_EX 8 ; only assembled for AMD64, where the byte count is ignored.
904ENDPROC iemAImpl_ %+ %1 %+ _u64
905 %endif ; RT_ARCH_AMD64
906
907 %if %2 != 0 ; locked versions requested?
908
909BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
910 PROLOGUE_3_ARGS
911 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
912 lock %1 word [A0], A1_16 ; same as the plain variant, but with a LOCK prefix.
913 IEM_SAVE_FLAGS A2, %3, %4
914 EPILOGUE_3_ARGS
915ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
916
917BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
918 PROLOGUE_3_ARGS
919 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
920 lock %1 dword [A0], A1_32
921 IEM_SAVE_FLAGS A2, %3, %4
922 EPILOGUE_3_ARGS
923ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
924
925 %ifdef RT_ARCH_AMD64
926BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
927 PROLOGUE_3_ARGS
928 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
929 lock %1 qword [A0], A1
930 IEM_SAVE_FLAGS A2, %3, %4
931 EPILOGUE_3_ARGS_EX 8
932ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
933 %endif ; RT_ARCH_AMD64
934 %endif ; locked
935%endmacro
936IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
937IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
938IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
939IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
940
941;;
942; Macro for implementing a bit search operator.
943;
944; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
945; system where the 64-bit accesses requires hand coding.
946;
947; All the functions takes a pointer to the destination memory operand in A0,
948; the source register operand in A1 and a pointer to eflags in A2.
949;
950; In the ZF case the destination register is 'undefined', however it seems that
951; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
952; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
953; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
954; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
955;
956; @param 1 The instruction mnemonic.
957; @param 2 The modified flags.
958; @param 3 The undefined flags.
959; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
960;
961%macro IEMIMPL_BIT_OP2 4
962BEGINCODE
963BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
964 PROLOGUE_3_ARGS
965 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
966 %1 T0_16, A1_16
967%if %4 != 0
968 jz .unchanged_dst
969%endif
970 mov [A0], T0_16
971.unchanged_dst:
972 IEM_SAVE_FLAGS A2, %2, %3
973 EPILOGUE_3_ARGS
974ENDPROC iemAImpl_ %+ %1 %+ _u16
975
976BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
977 PROLOGUE_3_ARGS
978 %1 T1_16, A1_16
979%if %4 != 0
980 jz .unchanged_dst
981%endif
982 mov [A0], T1_16
983 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
984 EPILOGUE_3_ARGS
985.unchanged_dst:
986 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
987 EPILOGUE_3_ARGS
988ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
989
990BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
991 PROLOGUE_3_ARGS
992 %1 T0_16, A1_16
993%if %4 != 0
994 jz .unchanged_dst
995%endif
996 mov [A0], T0_16
997.unchanged_dst:
998 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
999 EPILOGUE_3_ARGS
1000ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1001
1002
1003BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1004 PROLOGUE_3_ARGS
1005 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1006 %1 T0_32, A1_32
1007%if %4 != 0
1008 jz .unchanged_dst
1009%endif
1010 mov [A0], T0_32
1011.unchanged_dst:
1012 IEM_SAVE_FLAGS A2, %2, %3
1013 EPILOGUE_3_ARGS
1014ENDPROC iemAImpl_ %+ %1 %+ _u32
1015
1016BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1017 PROLOGUE_3_ARGS
1018 %1 T1_32, A1_32
1019%if %4 != 0
1020 jz .unchanged_dst
1021%endif
1022 mov [A0], T1_32
1023 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1024 EPILOGUE_3_ARGS
1025.unchanged_dst:
1026 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1027 EPILOGUE_3_ARGS
1028ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1029
1030BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1031 PROLOGUE_3_ARGS
1032 %1 T0_32, A1_32
1033%if %4 != 0
1034 jz .unchanged_dst
1035%endif
1036 mov [A0], T0_32
1037.unchanged_dst:
1038 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1039 EPILOGUE_3_ARGS
1040ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1041
1042
1043 %ifdef RT_ARCH_AMD64
1044
1045BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1046 PROLOGUE_3_ARGS
1047 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1048 %1 T0, A1
1049%if %4 != 0
1050 jz .unchanged_dst
1051%endif
1052 mov [A0], T0
1053.unchanged_dst:
1054 IEM_SAVE_FLAGS A2, %2, %3
1055 EPILOGUE_3_ARGS_EX 8
1056ENDPROC iemAImpl_ %+ %1 %+ _u64
1057
1058BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1059 PROLOGUE_3_ARGS
1060 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1061 %1 T1, A1
1062%if %4 != 0
1063 jz .unchanged_dst
1064%endif
1065 mov [A0], T1
1066 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1067 EPILOGUE_3_ARGS
1068.unchanged_dst:
1069 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1070 EPILOGUE_3_ARGS
1071ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1072
1073BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1074 PROLOGUE_3_ARGS
1075 %1 T0, A1
1076%if %4 != 0
1077 jz .unchanged_dst
1078%endif
1079 mov [A0], T0
1080.unchanged_dst:
1081 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1082 EPILOGUE_3_ARGS_EX 8
1083ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1084
1085 %endif ; RT_ARCH_AMD64
1086%endmacro
1087
1088IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1 ; Last arg non-zero: the workers skip the destination store when the instruction leaves ZF set.
1089IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1 ; Same as bsf: destination untouched when ZF is set.
1090IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0 ; Last arg zero: no ZF branch is emitted, the destination is always stored.
1091IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0 ; Same as tzcnt: destination always stored.
1092
1093
1094;;
1095; Macro for implementing POPCNT.
1096;
1097; This will generate code for the 16, 32 and 64 bit accesses; the 64-bit
1098; worker is only emitted when RT_ARCH_AMD64 is defined.
1099;
1100; All the functions take a pointer to the destination memory operand in A0,
1101; the source register operand in A1 and a pointer to eflags in A2.
1102;
1103; ASSUMES Intel and AMD set EFLAGS the same way.
1104;
1105; ASSUMES the instruction does not support memory destination.
1106;
1107; @param 1 The instruction mnemonic.
1108; @param 2 The modified flags.
1109; @param 3 The undefined flags.
1110;
1111%macro IEMIMPL_BIT_OP3 3
1112BEGINCODE
1113BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1114 PROLOGUE_3_ARGS
1115 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1116 %1 T0_16, A1_16 ; Run the real instruction on the source value; the result lands in T0.
1117 mov [A0], T0_16 ; Always store the result; mov does not alter EFLAGS, so the save below still sees the instruction's flags.
1118 IEM_SAVE_FLAGS A2, %2, %3
1119 EPILOGUE_3_ARGS
1120ENDPROC iemAImpl_ %+ %1 %+ _u16
1121
1122BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1123 PROLOGUE_3_ARGS
1124 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1125 %1 T0_32, A1_32 ; 32-bit form of the instruction.
1126 mov [A0], T0_32 ; Store the 32-bit result through the destination pointer.
1127 IEM_SAVE_FLAGS A2, %2, %3
1128 EPILOGUE_3_ARGS
1129ENDPROC iemAImpl_ %+ %1 %+ _u32
1130
1131 %ifdef RT_ARCH_AMD64
1132BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1133 PROLOGUE_3_ARGS
1134 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1135 %1 T0, A1 ; 64-bit form, only assembled for 64-bit hosts.
1136 mov [A0], T0 ; Store the 64-bit result through the destination pointer.
1137 IEM_SAVE_FLAGS A2, %2, %3
1138 EPILOGUE_3_ARGS_EX 8
1139ENDPROC iemAImpl_ %+ %1 %+ _u64
1140 %endif ; RT_ARCH_AMD64
1141%endmacro
1142IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0 ; All six status flags are listed as modified, none as undefined.
1143
1144
1145;
1146; IMUL is also a similar but yet different case (no lock, no mem dst).
1147; The rDX:rAX variant of imul is handled together with mul further down.
1148;
1149BEGINCODE
1150; @param 1 EFLAGS that are modified.
1151; @param 2 Undefined EFLAGS.
1152; @param 3 Function suffix.
1153; @param 4 EFLAGS variation: 0 for native, 1 for intel, 2 for AMD. Only the value 1 changes the generated code: it selects
1154; IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF (given AF | ZF and the result) instead of IEM_SAVE_FLAGS. NOTE(review): earlier text said 1 was ignored and 2 was special; that does not match the conditionals below - confirm the intended AMD behaviour.
1155%macro IEMIMPL_IMUL_TWO 4
1156BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1157 PROLOGUE_3_ARGS
1158 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1159 imul A1_16, word [A0] ; A1 = A1 * (value at A0); A0 points to the first operand and receives the product.
1160 mov [A0], A1_16 ; Store the truncated product; mov leaves EFLAGS intact for the save below.
1161 %if %4 != 1
1162 IEM_SAVE_FLAGS A2, %1, %2
1163 %else
1164 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1165 %endif
1166 EPILOGUE_3_ARGS
1167ENDPROC iemAImpl_imul_two_u16 %+ %3
1168
1169BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1170 PROLOGUE_3_ARGS
1171 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1172 imul A1_32, dword [A0] ; 32-bit two-operand multiply against the value at A0.
1173 mov [A0], A1_32 ; Store the truncated product.
1174 %if %4 != 1
1175 IEM_SAVE_FLAGS A2, %1, %2
1176 %else
1177 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1178 %endif
1179 EPILOGUE_3_ARGS
1180ENDPROC iemAImpl_imul_two_u32 %+ %3
1181
1182 %ifdef RT_ARCH_AMD64
1183BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1184 PROLOGUE_3_ARGS
1185 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1186 imul A1, qword [A0] ; 64-bit two-operand multiply, 64-bit hosts only.
1187 mov [A0], A1 ; Store the truncated product.
1188 %if %4 != 1
1189 IEM_SAVE_FLAGS A2, %1, %2
1190 %else
1191 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1192 %endif
1193 EPILOGUE_3_ARGS_EX 8
1194ENDPROC iemAImpl_imul_two_u64 %+ %3
1195 %endif ; RT_ARCH_AMD64
1196%endmacro
1197IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0 ; Native flavour: empty suffix, SF/ZF/AF/PF declared undefined.
1198IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1 ; Intel flavour: takes the adjust-and-calc flag path.
1199IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2 ; AMD flavour: takes the plain IEM_SAVE_FLAGS path, like native.
1200
1201
1202;
1203; XCHG for memory operands. This implies locking. No flag changes.
1204;
1205; Each function takes two arguments, first the pointer to the memory,
1206; then the pointer to the register. They all return void.
1207;
1208BEGINCODE
1209BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8 ; Atomic 8-bit exchange: A0 = pointer to memory, A1 = pointer to the register value.
1210 PROLOGUE_2_ARGS
1211 mov T0_8, [A1] ; T0 = value currently held by the register operand.
1212 xchg [A0], T0_8 ; Swap with memory; xchg with a memory operand is locked by the CPU without a lock prefix.
1213 mov [A1], T0_8 ; Return the previous memory value through the register pointer.
1214 EPILOGUE_2_ARGS
1215ENDPROC iemAImpl_xchg_u8_locked
1216
1217BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8 ; Atomic 16-bit exchange: A0 = pointer to memory, A1 = pointer to the register value.
1218 PROLOGUE_2_ARGS
1219 mov T0_16, [A1] ; T0 = value currently held by the register operand.
1220 xchg [A0], T0_16 ; Swap with memory (implicitly locked).
1221 mov [A1], T0_16 ; Return the previous memory value through the register pointer.
1222 EPILOGUE_2_ARGS
1223ENDPROC iemAImpl_xchg_u16_locked
1224
1225BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8 ; Atomic 32-bit exchange: A0 = pointer to memory, A1 = pointer to the register value.
1226 PROLOGUE_2_ARGS
1227 mov T0_32, [A1] ; T0 = value currently held by the register operand.
1228 xchg [A0], T0_32 ; Swap with memory (implicitly locked).
1229 mov [A1], T0_32 ; Return the previous memory value through the register pointer.
1230 EPILOGUE_2_ARGS
1231ENDPROC iemAImpl_xchg_u32_locked
1232
1233%ifdef RT_ARCH_AMD64
1234BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8 ; Atomic 64-bit exchange, only built for 64-bit hosts: A0 = pointer to memory, A1 = pointer to the register value.
1235 PROLOGUE_2_ARGS
1236 mov T0, [A1] ; T0 = value currently held by the register operand.
1237 xchg [A0], T0 ; Swap with memory (implicitly locked).
1238 mov [A1], T0 ; Return the previous memory value through the register pointer.
1239 EPILOGUE_2_ARGS
1240ENDPROC iemAImpl_xchg_u64_locked
1241%endif
1242
1243; Unlocked variants for fDisregardLock mode.
1244
1245BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8 ; Non-atomic 8-bit exchange built from plain moves: A0 = pointer to memory, A1 = pointer to the register value.
1246 PROLOGUE_2_ARGS
1247 mov T0_8, [A1] ; T0 = register value.
1248 mov T1_8, [A0] ; T1 = memory value.
1249 mov [A0], T0_8 ; Memory receives the register value...
1250 mov [A1], T1_8 ; ...and the register receives the old memory value (two separate accesses, no bus lock).
1251 EPILOGUE_2_ARGS
1252ENDPROC iemAImpl_xchg_u8_unlocked
1253
1254BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8 ; Non-atomic 16-bit exchange: A0 = pointer to memory, A1 = pointer to the register value.
1255 PROLOGUE_2_ARGS
1256 mov T0_16, [A1] ; T0 = register value.
1257 mov T1_16, [A0] ; T1 = memory value.
1258 mov [A0], T0_16 ; Memory receives the register value.
1259 mov [A1], T1_16 ; Register receives the old memory value.
1260 EPILOGUE_2_ARGS
1261ENDPROC iemAImpl_xchg_u16_unlocked
1262
1263BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8 ; Non-atomic 32-bit exchange: A0 = pointer to memory, A1 = pointer to the register value.
1264 PROLOGUE_2_ARGS
1265 mov T0_32, [A1] ; T0 = register value.
1266 mov T1_32, [A0] ; T1 = memory value.
1267 mov [A0], T0_32 ; Memory receives the register value.
1268 mov [A1], T1_32 ; Register receives the old memory value.
1269 EPILOGUE_2_ARGS
1270ENDPROC iemAImpl_xchg_u32_unlocked
1271
1272%ifdef RT_ARCH_AMD64
1273BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8 ; Non-atomic 64-bit exchange, only built for 64-bit hosts: A0 = pointer to memory, A1 = pointer to the register value.
1274 PROLOGUE_2_ARGS
1275 mov T0, [A1] ; T0 = register value.
1276 mov T1, [A0] ; T1 = memory value.
1277 mov [A0], T0 ; Memory receives the register value.
1278 mov [A1], T1 ; Register receives the old memory value.
1279 EPILOGUE_2_ARGS
1280ENDPROC iemAImpl_xchg_u64_unlocked
1281%endif
1282
1283
1284;
1285; XADD for memory operands.
1286;
1287; Each function takes three arguments, first the pointer to the
1288; memory/register, then the pointer to the register, and finally a pointer to
1289; eflags. They all return void.
1290;
1291BEGINCODE
1292BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12 ; 8-bit XADD without lock prefix: A0 = pointer to memory/register dst, A1 = pointer to register, A2 = pointer to eflags.
1293 PROLOGUE_3_ARGS
1294 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1295 mov T0_8, [A1] ; T0 = register operand (the addend).
1296 xadd [A0], T0_8 ; Destination becomes the sum, T0 receives the old destination value.
1297 mov [A1], T0_8 ; Write the old destination value back to the register operand; mov keeps the flags for the save below.
1298 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1299 EPILOGUE_3_ARGS
1300ENDPROC iemAImpl_xadd_u8
1301
1302BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12 ; 16-bit XADD without lock prefix: A0 = pointer to dst, A1 = pointer to register, A2 = pointer to eflags.
1303 PROLOGUE_3_ARGS
1304 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1305 mov T0_16, [A1] ; T0 = register operand (the addend).
1306 xadd [A0], T0_16 ; Destination becomes the sum, T0 receives the old destination value.
1307 mov [A1], T0_16 ; Write the old destination value back to the register operand.
1308 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1309 EPILOGUE_3_ARGS
1310ENDPROC iemAImpl_xadd_u16
1311
1312BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12 ; 32-bit XADD without lock prefix: A0 = pointer to dst, A1 = pointer to register, A2 = pointer to eflags.
1313 PROLOGUE_3_ARGS
1314 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1315 mov T0_32, [A1] ; T0 = register operand (the addend).
1316 xadd [A0], T0_32 ; Destination becomes the sum, T0 receives the old destination value.
1317 mov [A1], T0_32 ; Write the old destination value back to the register operand.
1318 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1319 EPILOGUE_3_ARGS
1320ENDPROC iemAImpl_xadd_u32
1321
1322%ifdef RT_ARCH_AMD64
1323BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12 ; 64-bit XADD without lock prefix, 64-bit hosts only: A0 = pointer to dst, A1 = pointer to register, A2 = pointer to eflags.
1324 PROLOGUE_3_ARGS
1325 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1326 mov T0, [A1] ; T0 = register operand (the addend).
1327 xadd [A0], T0 ; Destination becomes the sum, T0 receives the old destination value.
1328 mov [A1], T0 ; Write the old destination value back to the register operand.
1329 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1330 EPILOGUE_3_ARGS
1331ENDPROC iemAImpl_xadd_u64
1332%endif ; RT_ARCH_AMD64
1333
1334BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12 ; 8-bit XADD with lock prefix: A0 = pointer to dst, A1 = pointer to register, A2 = pointer to eflags.
1335 PROLOGUE_3_ARGS
1336 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1337 mov T0_8, [A1] ; T0 = register operand (the addend).
1338 lock xadd [A0], T0_8 ; Atomic read-modify-write: destination becomes the sum, T0 receives the old value.
1339 mov [A1], T0_8 ; Write the old destination value back to the register operand.
1340 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1341 EPILOGUE_3_ARGS
1342ENDPROC iemAImpl_xadd_u8_locked
1343
1344BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12 ; 16-bit XADD with lock prefix: A0 = pointer to dst, A1 = pointer to register, A2 = pointer to eflags.
1345 PROLOGUE_3_ARGS
1346 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1347 mov T0_16, [A1] ; T0 = register operand (the addend).
1348 lock xadd [A0], T0_16 ; Atomic read-modify-write: destination becomes the sum, T0 receives the old value.
1349 mov [A1], T0_16 ; Write the old destination value back to the register operand.
1350 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1351 EPILOGUE_3_ARGS
1352ENDPROC iemAImpl_xadd_u16_locked
1353
1354BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12 ; 32-bit XADD with lock prefix: A0 = pointer to dst, A1 = pointer to register, A2 = pointer to eflags.
1355 PROLOGUE_3_ARGS
1356 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1357 mov T0_32, [A1] ; T0 = register operand (the addend).
1358 lock xadd [A0], T0_32 ; Atomic read-modify-write: destination becomes the sum, T0 receives the old value.
1359 mov [A1], T0_32 ; Write the old destination value back to the register operand.
1360 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1361 EPILOGUE_3_ARGS
1362ENDPROC iemAImpl_xadd_u32_locked
1363
1364%ifdef RT_ARCH_AMD64
1365BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12 ; 64-bit XADD with lock prefix, 64-bit hosts only: A0 = pointer to dst, A1 = pointer to register, A2 = pointer to eflags.
1366 PROLOGUE_3_ARGS
1367 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1368 mov T0, [A1] ; T0 = register operand (the addend).
1369 lock xadd [A0], T0 ; Atomic read-modify-write: destination becomes the sum, T0 receives the old value.
1370 mov [A1], T0 ; Write the old destination value back to the register operand.
1371 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1372 EPILOGUE_3_ARGS
1373ENDPROC iemAImpl_xadd_u64_locked
1374%endif ; RT_ARCH_AMD64
1375
1376
1377;
1378; CMPXCHG8B.
1379;
1380; These are tricky register wise, so the code is duplicated for each calling
1381; convention.
1382;
1383; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1384;
1385; C-proto:
1386; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1387; uint32_t *pEFlags));
1388;
1389; Note! Identical to iemAImpl_cmpxchg16b.
1390;
1391BEGINCODE
1392BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16 ; Unlocked CMPXCHG8B worker; one hand-written body per calling convention because the instruction pins eax/edx/ebx/ecx.
1393%ifdef RT_ARCH_AMD64
1394 %ifdef ASM_CALL64_MSC
1395 push rbx ; rbx is callee-saved and is needed for the replacement value.
1396
1397 mov r11, rdx ; pu64EaxEdx (is also T1)
1398 mov r10, rcx ; pu64Dst
1399
1400 mov ebx, [r8] ; ecx:ebx = replacement value read from pu64EbxEcx (r8).
1401 mov ecx, [r8 + 4]
1402 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1403 mov eax, [r11] ; edx:eax = expected value; loaded after the flags macro since that clobbers eax.
1404 mov edx, [r11 + 4]
1405
1406 cmpxchg8b [r10]
1407
1408 mov [r11], eax ; Write edx:eax back before the flags macro reuses r11 (T1).
1409 mov [r11 + 4], edx
1410 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1411
1412 pop rbx
1413 ret
1414 %else
1415 push rbx ; rbx is callee-saved and is needed for the replacement value.
1416
1417 mov r10, rcx ; pEFlags
1418 mov r11, rdx ; pu64EbxEcx (is also T1)
1419
1420 mov ebx, [r11] ; ecx:ebx = replacement value.
1421 mov ecx, [r11 + 4]
1422 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1423 mov eax, [rsi] ; edx:eax = expected value read from pu64EaxEdx (rsi).
1424 mov edx, [rsi + 4]
1425
1426 cmpxchg8b [rdi]
1427
1428 mov [rsi], eax ; Write edx:eax back to pu64EaxEdx.
1429 mov [rsi + 4], edx
1430 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1431
1432 pop rbx
1433 ret
1434
1435 %endif
1436%else
1437 push esi ; Four pushes = 16 bytes, which is the 16 in the stack offsets below.
1438 push edi
1439 push ebx
1440 push ebp
1441
1442 mov edi, ecx ; pu64Dst
1443 mov esi, edx ; pu64EaxEdx
1444 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1445 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1446
1447 mov ebx, [ecx] ; ecx:ebx = replacement value; ecx is overwritten last because it holds the pointer.
1448 mov ecx, [ecx + 4]
1449 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1450 mov eax, [esi] ; edx:eax = expected value.
1451 mov edx, [esi + 4]
1452
1453 cmpxchg8b [edi]
1454
1455 mov [esi], eax ; Write edx:eax back to pu64EaxEdx.
1456 mov [esi + 4], edx
1457 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1458
1459 pop ebp
1460 pop ebx
1461 pop edi
1462 pop esi
1463 ret 8 ; Callee pops the two stack-passed arguments.
1464%endif
1465ENDPROC iemAImpl_cmpxchg8b
1466
1467BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16 ; Same register choreography as iemAImpl_cmpxchg8b, but the instruction carries a lock prefix.
1468%ifdef RT_ARCH_AMD64
1469 %ifdef ASM_CALL64_MSC
1470 push rbx ; rbx is callee-saved and is needed for the replacement value.
1471
1472 mov r11, rdx ; pu64EaxEdx (is also T1)
1473 mov r10, rcx ; pu64Dst
1474
1475 mov ebx, [r8] ; ecx:ebx = replacement value read from pu64EbxEcx (r8).
1476 mov ecx, [r8 + 4]
1477 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1478 mov eax, [r11] ; edx:eax = expected value; loaded after the flags macro since that clobbers eax.
1479 mov edx, [r11 + 4]
1480
1481 lock cmpxchg8b [r10]
1482
1483 mov [r11], eax ; Write edx:eax back before the flags macro reuses r11 (T1).
1484 mov [r11 + 4], edx
1485 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1486
1487 pop rbx
1488 ret
1489 %else
1490 push rbx ; rbx is callee-saved and is needed for the replacement value.
1491
1492 mov r10, rcx ; pEFlags
1493 mov r11, rdx ; pu64EbxEcx (is also T1)
1494
1495 mov ebx, [r11] ; ecx:ebx = replacement value.
1496 mov ecx, [r11 + 4]
1497 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1498 mov eax, [rsi] ; edx:eax = expected value read from pu64EaxEdx (rsi).
1499 mov edx, [rsi + 4]
1500
1501 lock cmpxchg8b [rdi]
1502
1503 mov [rsi], eax ; Write edx:eax back to pu64EaxEdx.
1504 mov [rsi + 4], edx
1505 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1506
1507 pop rbx
1508 ret
1509
1510 %endif
1511%else
1512 push esi ; Four pushes = 16 bytes, which is the 16 in the stack offsets below.
1513 push edi
1514 push ebx
1515 push ebp
1516
1517 mov edi, ecx ; pu64Dst
1518 mov esi, edx ; pu64EaxEdx
1519 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1520 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1521
1522 mov ebx, [ecx] ; ecx:ebx = replacement value; ecx is overwritten last because it holds the pointer.
1523 mov ecx, [ecx + 4]
1524 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1525 mov eax, [esi] ; edx:eax = expected value.
1526 mov edx, [esi + 4]
1527
1528 lock cmpxchg8b [edi]
1529
1530 mov [esi], eax ; Write edx:eax back to pu64EaxEdx.
1531 mov [esi + 4], edx
1532 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1533
1534 pop ebp
1535 pop ebx
1536 pop edi
1537 pop esi
1538 ret 8 ; Callee pops the two stack-passed arguments.
1539%endif
1540ENDPROC iemAImpl_cmpxchg8b_locked
1541
1542%ifdef RT_ARCH_AMD64
1543
1544;
1545; CMPXCHG16B.
1546;
1547; These are tricky register wise, so the code is duplicated for each calling
1548; convention.
1549;
1550; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1551;
1552; C-proto:
1553; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1554; uint32_t *pEFlags));
1555;
1556; Note! Identical to iemAImpl_cmpxchg8b.
1557;
1558BEGINCODE
1559BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16 ; Unlocked CMPXCHG16B worker; 64-bit mirror of the cmpxchg8b code using rax/rdx/rbx/rcx and 8-byte halves.
1560 %ifdef ASM_CALL64_MSC
1561 push rbx ; rbx is callee-saved and is needed for the replacement value.
1562
1563 mov r11, rdx ; pu64RaxRdx (is also T1)
1564 mov r10, rcx ; pu64Dst
1565
1566 mov rbx, [r8] ; rcx:rbx = replacement value read from the RbxRcx pointer (r8).
1567 mov rcx, [r8 + 8]
1568 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1569 mov rax, [r11] ; rdx:rax = expected value; loaded after the flags macro since that clobbers eax.
1570 mov rdx, [r11 + 8]
1571
1572 cmpxchg16b [r10] ; NOTE(review): the instruction requires a 16-byte aligned operand - presumably guaranteed by the caller, confirm.
1573
1574 mov [r11], rax ; Write rdx:rax back before the flags macro reuses r11 (T1).
1575 mov [r11 + 8], rdx
1576 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1577
1578 pop rbx
1579 ret
1580 %else
1581 push rbx ; rbx is callee-saved and is needed for the replacement value.
1582
1583 mov r10, rcx ; pEFlags
1584 mov r11, rdx ; pu64RbxRcx (is also T1)
1585
1586 mov rbx, [r11] ; rcx:rbx = replacement value.
1587 mov rcx, [r11 + 8]
1588 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1589 mov rax, [rsi] ; rdx:rax = expected value read from the RaxRdx pointer (rsi).
1590 mov rdx, [rsi + 8]
1591
1592 cmpxchg16b [rdi] ; NOTE(review): requires a 16-byte aligned operand - presumably guaranteed by the caller, confirm.
1593
1594 mov [rsi], rax ; Write rdx:rax back to the RaxRdx buffer.
1595 mov [rsi + 8], rdx
1596 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1597
1598 pop rbx
1599 ret
1600
1601 %endif
1602ENDPROC iemAImpl_cmpxchg16b
1603
1604BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16 ; Same as iemAImpl_cmpxchg16b, but the instruction carries a lock prefix.
1605 %ifdef ASM_CALL64_MSC
1606 push rbx ; rbx is callee-saved and is needed for the replacement value.
1607
1608 mov r11, rdx ; pu64RaxRdx (is also T1)
1609 mov r10, rcx ; pu64Dst
1610
1611 mov rbx, [r8] ; rcx:rbx = replacement value read from the RbxRcx pointer (r8).
1612 mov rcx, [r8 + 8]
1613 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1614 mov rax, [r11] ; rdx:rax = expected value; loaded after the flags macro since that clobbers eax.
1615 mov rdx, [r11 + 8]
1616
1617 lock cmpxchg16b [r10] ; NOTE(review): requires a 16-byte aligned operand - presumably guaranteed by the caller, confirm.
1618
1619 mov [r11], rax ; Write rdx:rax back before the flags macro reuses r11 (T1).
1620 mov [r11 + 8], rdx
1621 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1622
1623 pop rbx
1624 ret
1625 %else
1626 push rbx ; rbx is callee-saved and is needed for the replacement value.
1627
1628 mov r10, rcx ; pEFlags
1629 mov r11, rdx ; pu64RbxRcx (is also T1)
1630
1631 mov rbx, [r11] ; rcx:rbx = replacement value.
1632 mov rcx, [r11 + 8]
1633 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1634 mov rax, [rsi] ; rdx:rax = expected value read from the RaxRdx pointer (rsi).
1635 mov rdx, [rsi + 8]
1636
1637 lock cmpxchg16b [rdi] ; NOTE(review): requires a 16-byte aligned operand - presumably guaranteed by the caller, confirm.
1638
1639 mov [rsi], rax ; Write rdx:rax back to the RaxRdx buffer.
1640 mov [rsi + 8], rdx
1641 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1642
1643 pop rbx
1644 ret
1645
1646 %endif
1647ENDPROC iemAImpl_cmpxchg16b_locked
1648
1649%endif ; RT_ARCH_AMD64
1650
1651
1652;
1653; CMPXCHG.
1654;
1655; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1656;
1657; C-proto (puEax is dereferenced by the code, so it is a pointer; on 32-bit hosts the u64 worker also receives the register operand by pointer):
1658; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
1659; Macro parameters: 1 = optional lock prefix (may be empty), 2 = function name suffix. The accumulator is loaded from A1 before the instruction and written back afterwards.
1660BEGINCODE
1661%macro IEMIMPL_CMPXCHG 2
1662BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1663 PROLOGUE_4_ARGS
1664 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1665 mov al, [A1] ; Accumulator = expected value (after the flags macro, which clobbers eax).
1666 %1 cmpxchg [A0], A2_8 ; Equal: destination takes A2. Not equal: al takes the destination value.
1667 mov [A1], al ; Hand the (possibly updated) accumulator back; mov keeps the flags for the save below.
1668 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1669 EPILOGUE_4_ARGS
1670ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1671
1672BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1673 PROLOGUE_4_ARGS
1674 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1675 mov ax, [A1] ; Accumulator = expected value.
1676 %1 cmpxchg [A0], A2_16
1677 mov [A1], ax ; Hand the accumulator back.
1678 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1679 EPILOGUE_4_ARGS
1680ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1681
1682BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1683 PROLOGUE_4_ARGS
1684 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1685 mov eax, [A1] ; Accumulator = expected value.
1686 %1 cmpxchg [A0], A2_32
1687 mov [A1], eax ; Hand the accumulator back.
1688 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1689 EPILOGUE_4_ARGS
1690ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1691
1692BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1693%ifdef RT_ARCH_AMD64
1694 PROLOGUE_4_ARGS
1695 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1696 mov rax, [A1] ; Accumulator = expected value.
1697 %1 cmpxchg [A0], A2
1698 mov [A1], rax ; Hand the accumulator back.
1699 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1700 EPILOGUE_4_ARGS
1701%else
1702 ;
1703 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
1704 ;
1705 push esi ; Four pushes = 16 bytes, which is the 16 in the stack offsets below.
1706 push edi
1707 push ebx
1708 push ebp
1709
1710 mov edi, ecx ; pu64Dst
1711 mov esi, edx ; pu64Rax
1712 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1713 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1714
1715 mov ebx, [ecx] ; ecx:ebx = replacement value; ecx is overwritten last because it holds the pointer.
1716 mov ecx, [ecx + 4]
1717 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1718 mov eax, [esi] ; edx:eax = expected value.
1719 mov edx, [esi + 4]
1720
1721 lock cmpxchg8b [edi] ; NOTE(review): locked in both instantiations; the first macro parameter is not applied on this path.
1722
1723 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
1724 jnz .cmpxchg8b_not_equal ; ZF clear = compare failed; this was jz, which sent the failed case into the cmp below and reported success (ZF=1).
1725 cmp eax, eax ; just set the other flags.
1726.store:
1727 mov [esi], eax ; Write edx:eax back; mov keeps the flags produced by the cmp for the save below.
1728 mov [esi + 4], edx
1729 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1730
1731 pop ebp
1732 pop ebx
1733 pop edi
1734 pop esi
1735 ret 8 ; Callee pops the two stack-passed arguments.
1736
1737.cmpxchg8b_not_equal: ; edx:eax now holds the destination value, [esi] still holds the old accumulator.
1738 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1739 jne .store ; High halves differ: their compare decides the flags.
1740 cmp [esi], eax ; High halves equal: the low halves differ, so ZF ends up clear.
1741 jmp .store
1742
1743%endif
1744ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1745%endmacro ; IEMIMPL_CMPXCHG
1746
1747IEMIMPL_CMPXCHG , ,
1748IEMIMPL_CMPXCHG lock, _locked
1749
1750;;
1751; Macro for implementing a unary operator.
1752;
1753; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1754; variants; the 64-bit workers are only emitted when RT_ARCH_AMD64 is
1755; defined.
1756;
1757; All the functions take a pointer to the destination memory operand in A0
1758; and a pointer to eflags in A1 (two arguments; there is no source operand).
1759;
1760; @param 1 The instruction mnemonic.
1761; @param 2 The modified flags.
1762; @param 3 The undefined flags.
1763;
1764%macro IEMIMPL_UNARY_OP 3
1765BEGINCODE
1766BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1767 PROLOGUE_2_ARGS
1768 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1769 %1 byte [A0] ; Apply the instruction directly to the memory operand.
1770 IEM_SAVE_FLAGS A1, %2, %3
1771 EPILOGUE_2_ARGS
1772ENDPROC iemAImpl_ %+ %1 %+ _u8
1773
1774BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1775 PROLOGUE_2_ARGS
1776 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1777 lock %1 byte [A0] ; Same operation as an atomic read-modify-write.
1778 IEM_SAVE_FLAGS A1, %2, %3
1779 EPILOGUE_2_ARGS
1780ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1781
1782BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1783 PROLOGUE_2_ARGS
1784 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1785 %1 word [A0] ; 16-bit operand.
1786 IEM_SAVE_FLAGS A1, %2, %3
1787 EPILOGUE_2_ARGS
1788ENDPROC iemAImpl_ %+ %1 %+ _u16
1789
1790BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1791 PROLOGUE_2_ARGS
1792 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1793 lock %1 word [A0] ; 16-bit operand, atomic.
1794 IEM_SAVE_FLAGS A1, %2, %3
1795 EPILOGUE_2_ARGS
1796ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1797
1798BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1799 PROLOGUE_2_ARGS
1800 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1801 %1 dword [A0] ; 32-bit operand.
1802 IEM_SAVE_FLAGS A1, %2, %3
1803 EPILOGUE_2_ARGS
1804ENDPROC iemAImpl_ %+ %1 %+ _u32
1805
1806BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1807 PROLOGUE_2_ARGS
1808 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1809 lock %1 dword [A0] ; 32-bit operand, atomic.
1810 IEM_SAVE_FLAGS A1, %2, %3
1811 EPILOGUE_2_ARGS
1812ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1813
1814 %ifdef RT_ARCH_AMD64
1815BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1816 PROLOGUE_2_ARGS
1817 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1818 %1 qword [A0] ; 64-bit operand, 64-bit hosts only.
1819 IEM_SAVE_FLAGS A1, %2, %3
1820 EPILOGUE_2_ARGS
1821ENDPROC iemAImpl_ %+ %1 %+ _u64
1822
1823BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1824 PROLOGUE_2_ARGS
1825 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1826 lock %1 qword [A0] ; 64-bit operand, atomic.
1827 IEM_SAVE_FLAGS A1, %2, %3
1828 EPILOGUE_2_ARGS
1829ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1830 %endif ; RT_ARCH_AMD64
1831
1832%endmacro
1833
1834IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0 ; CF is not in the modified set for inc.
1835IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0 ; CF is not in the modified set for dec.
1836IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1837IEMIMPL_UNARY_OP not, 0, 0 ; Empty modified and undefined flag sets.
1838
1839
1840;
1841; BSWAP. No flag changes.
1842;
1843; Each function takes one argument, pointer to the value to bswap
1844; (input/output). They all return void.
1845;
1846BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4 ; BSWAP with a 16-bit operand size; A0 = pointer to the value. Note that a full dword is read and written through A0.
1847 PROLOGUE_1_ARGS
1848 mov T0_32, [A0] ; just in case any of the upper bits are used.
1849 db 66h ; Hand-emitted operand-size prefix so the bswap below executes in its 16-bit form.
1850 bswap T0_32
1851 mov [A0], T0_32 ; Store back all 32 bits of T0.
1852 EPILOGUE_1_ARGS
1853ENDPROC iemAImpl_bswap_u16
1854
1855BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4 ; Byte-reverses the 32-bit value at A0 in place.
1856 PROLOGUE_1_ARGS
1857 mov T0_32, [A0] ; Load the value.
1858 bswap T0_32 ; Reverse its four bytes.
1859 mov [A0], T0_32 ; Store it back.
1860 EPILOGUE_1_ARGS
1861ENDPROC iemAImpl_bswap_u32
1862
1863BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4 ; Byte-reverses the 64-bit value at A0 in place.
1864%ifdef RT_ARCH_AMD64
1865 PROLOGUE_1_ARGS
1866 mov T0, [A0] ; Load the value.
1867 bswap T0 ; Single 64-bit byte swap.
1868 mov [A0], T0 ; Store it back.
1869 EPILOGUE_1_ARGS
1870%else
1871 PROLOGUE_1_ARGS
1872 mov T0, [A0] ; T0 = low dword.
1873 mov T1, [A0 + 4] ; T1 = high dword.
1874 bswap T0 ; Reverse the bytes within each half...
1875 bswap T1
1876 mov [A0 + 4], T0 ; ...and swap the halves: the reversed low dword becomes the high dword.
1877 mov [A0], T1
1878 EPILOGUE_1_ARGS
1879%endif
1880ENDPROC iemAImpl_bswap_u64
1881
1882
1883;;
1884; Macro for implementing a shift operation.
1885;
1886; This will generate code for the 8, 16, 32 and 64 bit accesses; the 64-bit
1887; worker is only emitted when RT_ARCH_AMD64 is defined.
1888;
1889; All the functions take a pointer to the destination memory operand in A0,
1890; the shift count in A1 and a pointer to eflags in A2.
1891;
1892; @param 1 The instruction mnemonic.
1893; @param 2 The modified flags.
1894; @param 3 The undefined flags.
1895;
1896; Makes ASSUMPTIONS about A0, A1 and A2 assignments: the count must end up in cl. With ASM_CALL64_GCC it is copied there; otherwise A0 and A1 are swapped, which presumes A0 is xCX.
1897;
1898; @note the _intel and _amd variants are implemented in C.
1899;
1900%macro IEMIMPL_SHIFT_OP 3
1901BEGINCODE
1902BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1903 PROLOGUE_3_ARGS
1904 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1905 %ifdef ASM_CALL64_GCC
1906 mov cl, A1_8 ; Shift count into cl; mov does not disturb the flags set up above.
1907 %1 byte [A0], cl
1908 %else
1909 xchg A1, A0 ; Count moves into A0 (used as cl below), pointer into A1; xchg does not disturb the flags.
1910 %1 byte [A1], cl
1911 %endif
1912 IEM_SAVE_FLAGS A2, %2, %3
1913 EPILOGUE_3_ARGS
1914ENDPROC iemAImpl_ %+ %1 %+ _u8
1915
1916BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1917 PROLOGUE_3_ARGS
1918 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1919 %ifdef ASM_CALL64_GCC
1920 mov cl, A1_8 ; Shift count into cl.
1921 %1 word [A0], cl
1922 %else
1923 xchg A1, A0 ; Count into cl, pointer into A1.
1924 %1 word [A1], cl
1925 %endif
1926 IEM_SAVE_FLAGS A2, %2, %3
1927 EPILOGUE_3_ARGS
1928ENDPROC iemAImpl_ %+ %1 %+ _u16
1929
1930BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1931 PROLOGUE_3_ARGS
1932 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1933 %ifdef ASM_CALL64_GCC
1934 mov cl, A1_8 ; Shift count into cl.
1935 %1 dword [A0], cl
1936 %else
1937 xchg A1, A0 ; Count into cl, pointer into A1.
1938 %1 dword [A1], cl
1939 %endif
1940 IEM_SAVE_FLAGS A2, %2, %3
1941 EPILOGUE_3_ARGS
1942ENDPROC iemAImpl_ %+ %1 %+ _u32
1943
1944 %ifdef RT_ARCH_AMD64
1945BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1946 PROLOGUE_3_ARGS
1947 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1948 %ifdef ASM_CALL64_GCC
1949 mov cl, A1_8 ; Shift count into cl.
1950 %1 qword [A0], cl
1951 %else
1952 xchg A1, A0 ; Count into cl, pointer into A1.
1953 %1 qword [A1], cl
1954 %endif
1955 IEM_SAVE_FLAGS A2, %2, %3
1956 EPILOGUE_3_ARGS
1957ENDPROC iemAImpl_ %+ %1 %+ _u64
1958 %endif ; RT_ARCH_AMD64
1959
1960%endmacro
1961
1962IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1963IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1964IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0 ; Rotate through carry: CF is also an input to the instruction.
1965IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0 ; Rotate through carry: CF is also an input to the instruction.
1966IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1967IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1968IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1969
1970
1971;;
1972; Macro for implementing a double precision shift operation.
1973;
1974; This will generate code for the 16, 32 and 64 bit accesses; the 64-bit
1975; worker is only emitted when RT_ARCH_AMD64 is defined.
1976;
1977; The functions take a pointer to the destination operand (r/m) in A0, the source (reg) in
1978; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1979;
1980; @param 1 The instruction mnemonic.
1981; @param 2 The modified flags.
1982; @param 3 The undefined flags.
1983;
1984; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments: the count must end up in cl. With ASM_CALL64_GCC that is done by swapping A2 and A3 (presumes A3 is xCX), otherwise by swapping A0 and A2 (presumes A0 is xCX).
1985;
1986; @note the _intel and _amd variants are implemented in C.
1987;
1988%macro IEMIMPL_SHIFT_DBL_OP 3
1989BEGINCODE
1990BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1991 PROLOGUE_4_ARGS
1992 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1993 %ifdef ASM_CALL64_GCC
1994 xchg A3, A2 ; Count into A3 (cl); the eflags pointer is parked in A2.
1995 %1 [A0], A1_16, cl
1996 xchg A3, A2 ; Swap back so A3 is the eflags pointer again for the save; xchg leaves the flags alone.
1997 %else
1998 xchg A0, A2 ; Count into A0 (cl); the destination pointer moves to A2.
1999 %1 [A2], A1_16, cl
2000 %endif
2001 IEM_SAVE_FLAGS A3, %2, %3
2002 EPILOGUE_4_ARGS
2003ENDPROC iemAImpl_ %+ %1 %+ _u16
2004
2005BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
2006 PROLOGUE_4_ARGS
2007 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2008 %ifdef ASM_CALL64_GCC
2009 xchg A3, A2 ; Count into cl, eflags pointer parked in A2.
2010 %1 [A0], A1_32, cl
2011 xchg A3, A2 ; Restore A3 = eflags pointer.
2012 %else
2013 xchg A0, A2 ; Count into cl, destination pointer into A2.
2014 %1 [A2], A1_32, cl
2015 %endif
2016 IEM_SAVE_FLAGS A3, %2, %3
2017 EPILOGUE_4_ARGS
2018ENDPROC iemAImpl_ %+ %1 %+ _u32
2019
2020 %ifdef RT_ARCH_AMD64
2021BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
2022 PROLOGUE_4_ARGS
2023 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2024 %ifdef ASM_CALL64_GCC
2025 xchg A3, A2 ; Count into cl, eflags pointer parked in A2.
2026 %1 [A0], A1, cl
2027 xchg A3, A2 ; Restore A3 = eflags pointer.
2028 %else
2029 xchg A0, A2 ; Count into cl, destination pointer into A2.
2030 %1 [A2], A1, cl
2031 %endif
2032 IEM_SAVE_FLAGS A3, %2, %3
2033 EPILOGUE_4_ARGS_EX 12
2034ENDPROC iemAImpl_ %+ %1 %+ _u64
2035 %endif ; RT_ARCH_AMD64
2036
2037%endmacro
2038
2039IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2040IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2041
2042
2043;;
2044; Macro for implementing multiplication operations.
2045;
2046; This will generate code for the 8, 16, 32 and 64 bit accesses; the 64-bit
2047; worker is only emitted when RT_ARCH_AMD64 is defined.
2048;
2049; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2050; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2051; pointer to eflags in A3.
2052;
2053; The functions all return 0 so the caller can be used for div/idiv as well as
2054; for the mul/imul implementation.
2055;
2056; @param 1 The instruction mnemonic.
2057; @param 2 The modified flags.
2058; @param 3 The undefined flags.
2059; @param 4 Name suffix.
2060; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD. Only the value 1 changes the generated code (it selects IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF).
2061;
2062; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2063;
2064%macro IEMIMPL_MUL_OP 5
2065BEGINCODE
2066BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
2067 PROLOGUE_3_ARGS
2068 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2069 mov al, [A0] ; al = low byte of the AX value at A0 (the implicit multiplicand).
2070 %1 A1_8 ; ax = al * operand.
2071 mov [A0], ax ; Store the full 16-bit product back.
2072 %if %5 != 1
2073 IEM_SAVE_FLAGS A2, %2, %3
2074 %else
2075 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
2076 %endif
2077 xor eax, eax ; Return 0; done after the flags are saved because xor modifies EFLAGS.
2078 EPILOGUE_3_ARGS
2079ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
2080
2081BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
2082 PROLOGUE_4_ARGS
2083 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2084 mov ax, [A0] ; ax = multiplicand from the rAX pointer.
2085 %ifdef ASM_CALL64_GCC
2086 %1 A2_16 ; dx:ax = ax * operand.
2087 mov [A0], ax ; Low half of the product.
2088 mov [A1], dx ; High half of the product.
2089 %else
2090 mov T1, A1 ; Save the rDX pointer: the multiply overwrites xDX, which A1 presumably occupies in this convention.
2091 %1 A2_16 ; dx:ax = ax * operand.
2092 mov [A0], ax ; Low half of the product.
2093 mov [T1], dx ; High half of the product, via the saved pointer.
2094 %endif
2095 %if %5 != 1
2096 IEM_SAVE_FLAGS A3, %2, %3
2097 %else
2098 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
2099 %endif
2100 xor eax, eax ; Return 0 (after the flags have been saved).
2101 EPILOGUE_4_ARGS
2102ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
2103
2104BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
2105 PROLOGUE_4_ARGS
2106 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2107 mov eax, [A0] ; eax = multiplicand from the rAX pointer.
2108 %ifdef ASM_CALL64_GCC
2109 %1 A2_32 ; edx:eax = eax * operand.
2110 mov [A0], eax ; Low half of the product.
2111 mov [A1], edx ; High half of the product.
2112 %else
2113 mov T1, A1 ; Save the rDX pointer before the multiply overwrites xDX.
2114 %1 A2_32 ; edx:eax = eax * operand.
2115 mov [A0], eax ; Low half of the product.
2116 mov [T1], edx ; High half of the product, via the saved pointer.
2117 %endif
2118 %if %5 != 1
2119 IEM_SAVE_FLAGS A3, %2, %3
2120 %else
2121 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
2122 %endif
2123 xor eax, eax ; Return 0 (after the flags have been saved).
2124 EPILOGUE_4_ARGS
2125ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
2126
2127 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2128BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2129 PROLOGUE_4_ARGS
2130 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2131 mov rax, [A0] ; rax = multiplicand from the rAX pointer.
2132 %ifdef ASM_CALL64_GCC
2133 %1 A2 ; rdx:rax = rax * operand.
2134 mov [A0], rax ; Low half of the product.
2135 mov [A1], rdx ; High half of the product.
2136 %else
2137 mov T1, A1 ; Save the rDX pointer before the multiply overwrites rdx.
2138 %1 A2 ; rdx:rax = rax * operand.
2139 mov [A0], rax ; Low half of the product.
2140 mov [T1], rdx ; High half of the product, via the saved pointer.
2141 %endif
2142 %if %5 != 1
2143 IEM_SAVE_FLAGS A3, %2, %3
2144 %else
2145 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
2146 %endif
2147 xor eax, eax ; Return 0 (after the flags have been saved).
2148 EPILOGUE_4_ARGS_EX 12
2149ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2150 %endif ; RT_ARCH_AMD64
2151
2152%endmacro
2153
2154IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2155IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2156IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2157IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2158IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2159IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2160
2161
2162BEGINCODE
2163;;
2164; Worker function for negating the double-width number held as two 32-bit halves in T1:T0 (T0 = low half, T1 = high half); computed as 0 - value using sub/sbb.
2165; @uses None (T0,T1) - T0 and T1 carry the result; EFLAGS are modified by sub/sbb/add.
2166BEGINPROC iemAImpl_negate_T0_T1_u32
2167 push 0 ; Two zeroed stack slots serve as temporaries so no extra register is needed.
2168 push 0
2169 xchg T0_32, [xSP] ; T0 = 0, first slot = old low half.
2170 xchg T1_32, [xSP + xCB] ; T1 = 0, second slot = old high half.
2171 sub T0_32, [xSP] ; Low half = 0 - old low half; sets CF on borrow.
2172 sbb T1_32, [xSP + xCB] ; High half = 0 - old high half - borrow.
2173 add xSP, xCB*2 ; Drop the two temporaries.
2174 ret
2175ENDPROC iemAImpl_negate_T0_T1_u32
2176
2177%ifdef RT_ARCH_AMD64
2178;;
2179; Worker function for negating the double-width number held as two 64-bit halves in T1:T0 (T0 = low half, T1 = high half); computed as 0 - value using sub/sbb.
2180; @uses None (T0,T1) - T0 and T1 carry the result; EFLAGS are modified by sub/sbb/add.
2181BEGINPROC iemAImpl_negate_T0_T1_u64
2182 push 0 ; Two zeroed stack slots serve as temporaries so no extra register is needed.
2183 push 0
2184 xchg T0, [xSP] ; T0 = 0, first slot = old low half.
2185 xchg T1, [xSP + xCB] ; T1 = 0, second slot = old high half.
2186 sub T0, [xSP] ; Low half = 0 - old low half; sets CF on borrow.
2187 sbb T1, [xSP + xCB] ; High half = 0 - old high half - borrow.
2188 add xSP, xCB*2 ; Drop the two temporaries.
2189 ret
2190ENDPROC iemAImpl_negate_T0_T1_u64
2191%endif
2192
2193
2194;;
2195; Macro for implementing division operations.
2196;
2197; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2198; 32-bit systems where the 64-bit accesses require hand coding.
2199;
2200; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2201; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2202; pointer to eflags in A3.
2203;
2204; The functions all return 0 on success and -1 if a divide error should be
2205; raised by the caller.
2206;
2207; @param 1 The instruction mnemonic.
2208; @param 2 The modified flags.
2209; @param 3 The undefined flags.
2210; @param 4 1 if signed, 0 if unsigned.
2211; @param 5 Function suffix.
2212; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2213; 2 for AMD (set AF, clear PF, ZF and SF).
2214;
2215; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2216;
2217%macro IEMIMPL_DIV_OP 6
2218BEGINCODE
;
; 8-bit variant: divides the 16-bit AX value behind A0 by the 8-bit divisor
; in A1 and stores the resulting AX back to A0.  A2 points to the eflags.
; Returns 0 in eax on success and -1 if the caller should raise a divide
; error (zero divisor or quotient out of range); nothing is written then.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; AH >= divisor means the quotient needs more than 8 bits
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
        js      .div_overflow           ; Still negative: dividend was -2^15, which no 8-bit divisor can
                                        ; handle.  Must be rejected here as the shifted magnitude (0x100)
                                        ; would not fit the 8-bit compare below.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0                      ; (pop leaves the flags of the compare intact)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8                    ; work on the absolute value of the divisor
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
        js      .div_overflow           ; Dividend was -2^15, see above.
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; tell the caller to raise a divide error
        jmp     .return
ENDPROC             iemAImpl_ %+ %1 %+ _u8 %+ %5
2283
;
; 16-bit variant: divides DX:AX (A1 points to DX, A0 to AX) by the 16-bit
; divisor in A2 and stores AX and DX back.  A3 points to the eflags.
; Returns 0 in eax on success and -1 if the caller should raise a divide
; error (zero divisor or quotient out of range); nothing is written then.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; DX >= divisor means the quotient needs more than 16 bits
        jae     .div_overflow
 %else
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
        js      .div_overflow           ; Still negative: dividend was -2^31, which no 16-bit divisor can
                                        ; handle.  Must be rejected here as the shifted magnitude (0x10000)
                                        ; would not fit the 16-bit compare below.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0                      ; (pop leaves the flags of the compare intact)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16                   ; work on the absolute value of the divisor
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
        js      .div_overflow           ; Dividend was -2^31, see above.
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor goes to T1 - presumably A2 aliases rdx, which is loaded below
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; DX pointer goes to T1 - presumably A1 aliases (r|e)dx, which is loaded below
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; tell the caller to raise a divide error
        jmp     .return
ENDPROC             iemAImpl_ %+ %1 %+ _u16 %+ %5
2361
;
; 32-bit variant: divides EDX:EAX (A1 points to EDX, A0 to EAX) by the 32-bit
; divisor in A2 and stores EAX and EDX back.  A3 points to the eflags.
; Returns 0 in eax on success and -1 if the caller should raise a divide
; error (zero divisor or quotient out of range); nothing is written then.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; EDX >= divisor means the quotient needs more than 32 bits
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we are out of registers on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
        test    T1_32, T1_32            ; Still negative: dividend was -2^63, which no 32-bit divisor can
        js      .div_overflow           ; handle.  Must be rejected here as the shl below would drop the
                                        ; top bit and make the magnitude look tiny.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0                      ; (pop leaves the flags of the compare intact)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32                   ; work on the absolute value of the divisor
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
        test    T1_32, T1_32            ; Dividend was -2^63, see above.
        js      .div_overflow
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the original (signed) divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor goes to T1 - presumably A2 aliases rdx, which is loaded below
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; EDX pointer goes to T1 - presumably A1 aliases (r|e)dx, which is loaded below
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; balance the push done by the signed range check
 %endif
.div_zero:
        mov     eax, -1                 ; tell the caller to raise a divide error
        jmp     .return
ENDPROC             iemAImpl_ %+ %1 %+ _u32 %+ %5
2447
2448 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;
; 64-bit variant: divides RDX:RAX (A1 points to RDX, A0 to RAX) by the 64-bit
; divisor in A2 and stores RAX and RDX back.  A3 points to the eflags.
; Returns 0 in eax on success and -1 if the caller should raise a divide
; error (zero divisor or quotient out of range); nothing is written then.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2, A2
        jz      .div_zero

        ; Overflow check, same scheme as the narrower variants.
 %if %4 == 0
        cmp     [A1], A2                ; RDX >= divisor means the quotient needs more than 64 bits
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it.
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
        test    T1, T1                  ; Still negative: dividend was -2^127, which no 64-bit divisor can
        js      .div_overflow           ; handle.  Must be rejected here as the shl below would drop the
                                        ; top bit and make the magnitude look tiny.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0                      ; (pop leaves the flags of the compare intact)
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff  ; (no 64-bit immediate form of AND, so go via T1)
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2                      ; work on the absolute value of the divisor
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
        test    T1, T1                  ; Dividend was -2^127, see above.
        js      .div_overflow
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the original (signed) divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor goes to T1 - presumably A2 aliases rdx, which is loaded below
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; RDX pointer goes to T1 - presumably A1 aliases rdx, which is loaded below
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; balance the push done by the signed range check
 %endif
.div_zero:
        mov     eax, -1                 ; tell the caller to raise a divide error
        jmp     .return
ENDPROC             iemAImpl_ %+ %1 %+ _u64 %+ %5
2531 %endif ; !RT_ARCH_AMD64
2532
2533%endmacro
2534
; Instantiate the division workers. Arguments: mnemonic, modified flags,
; undefined flags, signed (1) or unsigned (0), function suffix and EFLAGS
; variation (0 = native, 1 = intel, 2 = AMD).
2535IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
2536IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
2537IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
2538IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
2539IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
2540IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2541
2542
2543;;
2544; Macro for implementing memory fence operation.
2545;
2546; Generates a function named iemAImpl_ followed by the instruction name, which
2547; simply executes the instruction and returns. No return value, no operands.
2548; @param 1 The instruction.
2549;
2550%macro IEMIMPL_MEM_FENCE 1
2551BEGINCODE
2552BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
2553 %1
2554 ret
2555ENDPROC iemAImpl_ %+ %1
2556%endmacro
2557
; One worker per fence instruction.
2558IEMIMPL_MEM_FENCE lfence
2559IEMIMPL_MEM_FENCE sfence
2560IEMIMPL_MEM_FENCE mfence
2561
2562;;
2563; Alternative for non-SSE2 host.
2564;
; Uses an xchg with a memory operand, which the CPU executes with implicit
; LOCK semantics, in place of the dedicated fence instructions. xAX is swapped
; with its own just-pushed copy, so its value is unchanged on return.
2565BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
2566 push xAX ; scratch stack slot holding a copy of xAX
2567 xchg xAX, [xSP] ; implicitly locked exchange with that slot
2568 add xSP, xCB ; drop the scratch slot again
2569 ret
2570ENDPROC iemAImpl_alt_mem_fence
2571
2572
2573;;
2574; Initialize the FPU for the actual instruction being emulated, this means
2575; loading parts of the guest's control word and status word.
2576;
; The current FPU environment is stored at [xSP], patched and loaded back, so
; the caller must have reserved scratch space at the top of the stack first
; (the workers in this file do 'sub xSP, 20h').
;
2577; @uses 24 bytes of stack. T0, T1
; NOTE(review): the 32-bit protected mode fnstenv image is 28 bytes per the
; Intel SDM, so the '24 bytes' above looks understated - confirm.
2578; @param 1 Expression giving the address of the FXSTATE of the guest.
2579;
2580%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
2581 fnstenv [xSP] ; snapshot of the current environment to patch
2582
2583 ; FCW - for exception, precision and rounding control.
2584 movzx T0, word [%1 + X86FXSTATE.FCW]
2585 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2586 mov [xSP + X86FSTENV32P.FCW], T0_16
2587
2588 ; FSW - for undefined C0, C1, C2, and C3.
2589 movzx T1, word [%1 + X86FXSTATE.FSW]
2590 and T1, X86_FSW_C_MASK ; guest condition code bits only
2591 movzx T0, word [xSP + X86FSTENV32P.FSW]
2592 and T0, X86_FSW_TOP_MASK ; keep the TOP field of the current environment
2593 or T0, T1
2594 mov [xSP + X86FSTENV32P.FSW], T0_16
2595
2596 fldenv [xSP] ; activate the patched environment
2597%endmacro
2598
2599
2600;;
2601; Initialize the FPU for the actual instruction being emulated, this means
2602; loading parts of the guest's control word, status word, and update the
2603; tag word for the top register if it's empty.
2604;
2605; ASSUMES actual TOP=7
2606;
; The current FPU environment is stored at [xSP], patched and loaded back, so
; the caller must have reserved scratch space at the top of the stack first.
;
2607; @uses 24 bytes of stack. T0, T1
; NOTE(review): the 32-bit protected mode fnstenv image is 28 bytes per the
; Intel SDM, so the '24 bytes' above looks understated - confirm.
2608; @param 1 Expression giving the address of the FXSTATE of the guest.
2609;
2610%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
2611 fnstenv [xSP] ; snapshot of the current environment to patch
2612
2613 ; FCW - for exception, precision and rounding control.
2614 movzx T0_32, word [%1 + X86FXSTATE.FCW]
2615 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2616 mov [xSP + X86FSTENV32P.FCW], T0_16
2617
2618 ; FSW - for undefined C0, C1, C2, and C3.
2619 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2620 and T1_32, X86_FSW_C_MASK ; guest condition code bits only
2621 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
2622 and T0_32, X86_FSW_TOP_MASK ; keep the TOP field of the current environment
2623 or T0_32, T1_32
2624 mov [xSP + X86FSTENV32P.FSW], T0_16
2625
2626 ; FTW - Only for ST0 (in/out).
2627 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2628 shr T1_32, X86_FSW_TOP_SHIFT
2629 and T1_32, X86_FSW_TOP_SMASK ; T1 = guest TOP, i.e. the register index of the guest ST0
2630 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
2631 jc %%st0_not_empty
2632 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
2633%%st0_not_empty:
2634
2635 fldenv [xSP] ; activate the patched environment
2636%endmacro
2637
2638
2639;;
2640; Need to move this as well somewhere better?
2641;
; Output structure filled in by the FPU workers below: the 80-bit result
; followed by the FPU status word produced by the instruction.
2642struc IEMFPURESULT
2643 .r80Result resw 5 ; 80-bit floating point result (5 words)
2644 .FSW resw 1 ; resulting FPU status word
2645endstruc
2646
2647
2648;;
2649; Need to move this as well somewhere better?
2650;
; Output structure for two 80-bit results with the FPU status word in between.
2651struc IEMFPURESULTTWO
2652 .r80Result1 resw 5 ; first 80-bit result (5 words)
2653 .FSW resw 1 ; resulting FPU status word
2654 .r80Result2 resw 5 ; second 80-bit result (5 words)
2655endstruc
2656
2657
2658;
2659;---------------------- 16-bit signed integer operations ----------------------
2660;
2661
2662
2663;;
2664; Converts a 16-bit signed integer value to a 80-bit one (fpu register).
2665;
2666; @param A0 FPU context (fxsave).
2667; @param A1 Pointer to a IEMFPURESULT for the output.
2668; @param A2 Pointer to the 16-bit signed integer value to convert.
2669;
2670BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
2671 PROLOGUE_3_ARGS
2672 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2673
2674 fninit
2675 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2676 fild word [A2]
2677
2678 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
2679 fnclex
2680 fstp tword [A1 + IEMFPURESULT.r80Result]
2681
2682 fninit ; leave the FPU in its initial state
2683 add xSP, 20h
2684 EPILOGUE_3_ARGS
2685ENDPROC iemAImpl_fild_r80_from_i16
2686
2687
2688;;
2689; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2690;
2691; @param A0 FPU context (fxsave).
2692; @param A1 Where to return the output FSW.
2693; @param A2 Where to store the 16-bit signed integer value.
2694; @param A3 Pointer to the 80-bit value.
2695;
2696BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
2697 PROLOGUE_4_ARGS
2698 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2699
2700 fninit
2701 fld tword [A3] ; value is loaded before the guest FCW/FSW bits are applied
2702 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2703 fistp word [A2]
2704
2705 fnstsw word [A1] ; return the resulting FSW
2706
2707 fninit ; leave the FPU in its initial state
2708 add xSP, 20h
2709 EPILOGUE_4_ARGS
2710ENDPROC iemAImpl_fist_r80_to_i16
2711
2712
2713;;
2714; Store a 80-bit floating point value (register) as a 16-bit signed integer
2715; (memory) with truncation.
2716;
2717; @param A0 FPU context (fxsave).
2718; @param A1 Where to return the output FSW.
2719; @param A2 Where to store the 16-bit signed integer value.
2720; @param A3 Pointer to the 80-bit value.
2721;
2722BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
2723 PROLOGUE_4_ARGS
2724 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2725
2726 fninit
2727 fld tword [A3] ; value is loaded before the guest FCW/FSW bits are applied
2728 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2729 fisttp word [A2]
2730
2731 fnstsw word [A1] ; return the resulting FSW
2732
2733 fninit ; leave the FPU in its initial state
2734 add xSP, 20h
2735 EPILOGUE_4_ARGS
2736ENDPROC iemAImpl_fistt_r80_to_i16
2737
2738
2739;;
2740; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2741;
2742; @param 1 The instruction
2743;
2744; @param A0 FPU context (fxsave).
2745; @param A1 Pointer to a IEMFPURESULT for the output.
2746; @param A2 Pointer to the 80-bit value.
2747; @param A3 Pointer to the 16-bit value.
2748;
2749%macro IEMIMPL_FPU_R80_BY_I16 1
2750BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2751 PROLOGUE_4_ARGS
2752 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2753
2754 fninit
2755 fld tword [A2] ; the 80-bit operand becomes ST0
2756 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2757 %1 word [A3]
2758
2759 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
2760 fnclex
2761 fstp tword [A1 + IEMFPURESULT.r80Result]
2762
2763 fninit ; leave the FPU in its initial state
2764 add xSP, 20h
2765 EPILOGUE_4_ARGS
2766ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2767%endmacro
2768
2769IEMIMPL_FPU_R80_BY_I16 fiadd
2770IEMIMPL_FPU_R80_BY_I16 fimul
2771IEMIMPL_FPU_R80_BY_I16 fisub
2772IEMIMPL_FPU_R80_BY_I16 fisubr
2773IEMIMPL_FPU_R80_BY_I16 fidiv
2774IEMIMPL_FPU_R80_BY_I16 fidivr
2775
2776
2777;;
2778; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2779; only returning FSW.
2780;
2781; @param 1 The instruction
2782;
2783; @param A0 FPU context (fxsave).
2784; @param A1 Where to store the output FSW.
2785; @param A2 Pointer to the 80-bit value.
2786; @param A3 Pointer to the 16-bit value.
2787;
2788%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
2789BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2790 PROLOGUE_4_ARGS
2791 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2792
2793 fninit
2794 fld tword [A2] ; the 80-bit operand becomes ST0
2795 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2796 %1 word [A3]
2797
2798 fnstsw word [A1] ; only the status word is returned
2799
2800 fninit ; leave the FPU in its initial state
2801 add xSP, 20h
2802 EPILOGUE_4_ARGS
2803ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2804%endmacro
2805
2806IEMIMPL_FPU_R80_BY_I16_FSW ficom
2807
2808
2809
2810;
2811;---------------------- 32-bit signed integer operations ----------------------
2812;
2813
2814
2815;;
2816; Converts a 32-bit signed integer value to a 80-bit one (fpu register).
2817;
2818; @param A0 FPU context (fxsave).
2819; @param A1 Pointer to a IEMFPURESULT for the output.
2820; @param A2 Pointer to the 32-bit signed integer value to convert.
2821;
2822BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
2823 PROLOGUE_3_ARGS
2824 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2825
2826 fninit
2827 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2828 fild dword [A2]
2829
2830 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
2831 fnclex
2832 fstp tword [A1 + IEMFPURESULT.r80Result]
2833
2834 fninit ; leave the FPU in its initial state
2835 add xSP, 20h
2836 EPILOGUE_3_ARGS
2837ENDPROC iemAImpl_fild_r80_from_i32
2838
2839
2840;;
2841; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2842;
2843; @param A0 FPU context (fxsave).
2844; @param A1 Where to return the output FSW.
2845; @param A2 Where to store the 32-bit signed integer value.
2846; @param A3 Pointer to the 80-bit value.
2847;
2848BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2849 PROLOGUE_4_ARGS
2850 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2851
2852 fninit
2853 fld tword [A3] ; value is loaded before the guest FCW/FSW bits are applied
2854 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2855 fistp dword [A2]
2856
2857 fnstsw word [A1] ; return the resulting FSW
2858
2859 fninit ; leave the FPU in its initial state
2860 add xSP, 20h
2861 EPILOGUE_4_ARGS
2862ENDPROC iemAImpl_fist_r80_to_i32
2863
2864
2865;;
2866; Store a 80-bit floating point value (register) as a 32-bit signed integer
2867; (memory) with truncation.
2868;
2869; @param A0 FPU context (fxsave).
2870; @param A1 Where to return the output FSW.
2871; @param A2 Where to store the 32-bit signed integer value.
2872; @param A3 Pointer to the 80-bit value.
2873;
2874BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2875 PROLOGUE_4_ARGS
2876 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2877
2878 fninit
2879 fld tword [A3] ; value is loaded before the guest FCW/FSW bits are applied
2880 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2881 fisttp dword [A2]
2882
2883 fnstsw word [A1] ; return the resulting FSW
2884
2885 fninit ; leave the FPU in its initial state
2886 add xSP, 20h
2887 EPILOGUE_4_ARGS
2888ENDPROC iemAImpl_fistt_r80_to_i32
2889
2890
2891;;
2892; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2893;
2894; @param 1 The instruction
2895;
2896; @param A0 FPU context (fxsave).
2897; @param A1 Pointer to a IEMFPURESULT for the output.
2898; @param A2 Pointer to the 80-bit value.
2899; @param A3 Pointer to the 32-bit value.
2900;
2901%macro IEMIMPL_FPU_R80_BY_I32 1
2902BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2903 PROLOGUE_4_ARGS
2904 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2905
2906 fninit
2907 fld tword [A2] ; the 80-bit operand becomes ST0
2908 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2909 %1 dword [A3]
2910
2911 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
2912 fnclex
2913 fstp tword [A1 + IEMFPURESULT.r80Result]
2914
2915 fninit ; leave the FPU in its initial state
2916 add xSP, 20h
2917 EPILOGUE_4_ARGS
2918ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2919%endmacro
2920
2921IEMIMPL_FPU_R80_BY_I32 fiadd
2922IEMIMPL_FPU_R80_BY_I32 fimul
2923IEMIMPL_FPU_R80_BY_I32 fisub
2924IEMIMPL_FPU_R80_BY_I32 fisubr
2925IEMIMPL_FPU_R80_BY_I32 fidiv
2926IEMIMPL_FPU_R80_BY_I32 fidivr
2927
2928
2929;;
2930; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2931; only returning FSW.
2932;
2933; @param 1 The instruction
2934;
2935; @param A0 FPU context (fxsave).
2936; @param A1 Where to store the output FSW.
2937; @param A2 Pointer to the 80-bit value.
2938; @param A3 Pointer to the 32-bit value.
2939;
2940%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2941BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2942 PROLOGUE_4_ARGS
2943 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2944
2945 fninit
2946 fld tword [A2] ; the 80-bit operand becomes ST0
2947 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2948 %1 dword [A3]
2949
2950 fnstsw word [A1] ; only the status word is returned
2951
2952 fninit ; leave the FPU in its initial state
2953 add xSP, 20h
2954 EPILOGUE_4_ARGS
2955ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2956%endmacro
2957
2958IEMIMPL_FPU_R80_BY_I32_FSW ficom
2959
2960
2961
2962;
2963;---------------------- 64-bit signed integer operations ----------------------
2964;
2965
2966
2967;;
2968; Converts a 64-bit signed integer value to a 80-bit one (fpu register).
2969;
2970; @param A0 FPU context (fxsave).
2971; @param A1 Pointer to a IEMFPURESULT for the output.
2972; @param A2 Pointer to the 64-bit signed integer value to convert.
2973;
2974BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
2975 PROLOGUE_3_ARGS
2976 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
2977
2978 fninit
2979 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2980 fild qword [A2]
2981
2982 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
2983 fnclex
2984 fstp tword [A1 + IEMFPURESULT.r80Result]
2985
2986 fninit ; leave the FPU in its initial state
2987 add xSP, 20h
2988 EPILOGUE_3_ARGS
2989ENDPROC iemAImpl_fild_r80_from_i64
2990
2991
2992;;
2993; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2994;
2995; @param A0 FPU context (fxsave).
2996; @param A1 Where to return the output FSW.
2997; @param A2 Where to store the 64-bit signed integer value.
2998; @param A3 Pointer to the 80-bit value.
2999;
3000BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
3001 PROLOGUE_4_ARGS
3002 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3003
3004 fninit
3005 fld tword [A3] ; value is loaded before the guest FCW/FSW bits are applied
3006 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3007 fistp qword [A2]
3008
3009 fnstsw word [A1] ; return the resulting FSW
3010
3011 fninit ; leave the FPU in its initial state
3012 add xSP, 20h
3013 EPILOGUE_4_ARGS
3014ENDPROC iemAImpl_fist_r80_to_i64
3015
3016
3017;;
3018; Store a 80-bit floating point value (register) as a 64-bit signed integer
3019; (memory) with truncation.
3020;
3021; @param A0 FPU context (fxsave).
3022; @param A1 Where to return the output FSW.
3023; @param A2 Where to store the 64-bit signed integer value.
3024; @param A3 Pointer to the 80-bit value.
3025;
3026BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
3027 PROLOGUE_4_ARGS
3028 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3029
3030 fninit
3031 fld tword [A3] ; value is loaded before the guest FCW/FSW bits are applied
3032 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3033 fisttp qword [A2]
3034
3035 fnstsw word [A1] ; return the resulting FSW
3036
3037 fninit ; leave the FPU in its initial state
3038 add xSP, 20h
3039 EPILOGUE_4_ARGS
3040ENDPROC iemAImpl_fistt_r80_to_i64
3041
3042
3043
3044;
3045;---------------------- 32-bit floating point operations ----------------------
3046;
3047
3048;;
3049; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3050;
3051; @param A0 FPU context (fxsave).
3052; @param A1 Pointer to a IEMFPURESULT for the output.
3053; @param A2 Pointer to the 32-bit floating point value to convert.
3054;
3055BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
3056 PROLOGUE_3_ARGS
3057 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3058
3059 fninit
3060 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3061 fld dword [A2]
3062
3063 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
3064 fnclex
3065 fstp tword [A1 + IEMFPURESULT.r80Result]
3066
3067 fninit ; leave the FPU in its initial state
3068 add xSP, 20h
3069 EPILOGUE_3_ARGS
3070ENDPROC iemAImpl_fld_r80_from_r32
3071
3072
3073;;
3074; Store a 80-bit floating point value (register) as a 32-bit one (memory).
3075;
3076; @param A0 FPU context (fxsave).
3077; @param A1 Where to return the output FSW.
3078; @param A2 Where to store the 32-bit value.
3079; @param A3 Pointer to the 80-bit value.
3080;
3081BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
3082 PROLOGUE_4_ARGS
3083 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3084
3085 fninit
3086 fld tword [A3] ; value is loaded before the guest FCW/FSW bits are applied
3087 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3088 fst dword [A2] ; non-popping store; the trailing fninit resets the FPU anyway
3089
3090 fnstsw word [A1] ; return the resulting FSW
3091
3092 fninit ; leave the FPU in its initial state
3093 add xSP, 20h
3094 EPILOGUE_4_ARGS
3095ENDPROC iemAImpl_fst_r80_to_r32
3096
3097
3098;;
3099; FPU instruction working on one 80-bit and one 32-bit floating point value.
3100;
3101; @param 1 The instruction
3102;
3103; @param A0 FPU context (fxsave).
3104; @param A1 Pointer to a IEMFPURESULT for the output.
3105; @param A2 Pointer to the 80-bit value.
3106; @param A3 Pointer to the 32-bit value.
3107;
3108%macro IEMIMPL_FPU_R80_BY_R32 1
3109BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3110 PROLOGUE_4_ARGS
3111 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3112
3113 fninit
3114 fld tword [A2] ; the 80-bit operand becomes ST0
3115 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3116 %1 dword [A3]
3117
3118 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
3119 fnclex
3120 fstp tword [A1 + IEMFPURESULT.r80Result]
3121
3122 fninit ; leave the FPU in its initial state
3123 add xSP, 20h
3124 EPILOGUE_4_ARGS
3125ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3126%endmacro
3127
3128IEMIMPL_FPU_R80_BY_R32 fadd
3129IEMIMPL_FPU_R80_BY_R32 fmul
3130IEMIMPL_FPU_R80_BY_R32 fsub
3131IEMIMPL_FPU_R80_BY_R32 fsubr
3132IEMIMPL_FPU_R80_BY_R32 fdiv
3133IEMIMPL_FPU_R80_BY_R32 fdivr
3134
3135
3136;;
3137; FPU instruction working on one 80-bit and one 32-bit floating point value,
3138; only returning FSW.
3139;
3140; @param 1 The instruction
3141;
3142; @param A0 FPU context (fxsave).
3143; @param A1 Where to store the output FSW.
3144; @param A2 Pointer to the 80-bit value.
3145; @param A3 Pointer to the 32-bit value.
3146;
3147%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3148BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3149 PROLOGUE_4_ARGS
3150 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3151
3152 fninit
3153 fld tword [A2] ; the 80-bit operand becomes ST0
3154 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3155 %1 dword [A3]
3156
3157 fnstsw word [A1] ; only the status word is returned
3158
3159 fninit ; leave the FPU in its initial state
3160 add xSP, 20h
3161 EPILOGUE_4_ARGS
3162ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3163%endmacro
3164
3165IEMIMPL_FPU_R80_BY_R32_FSW fcom
3166
3167
3168
3169;
3170;---------------------- 64-bit floating point operations ----------------------
3171;
3172
3173;;
3174; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3175;
3176; @param A0 FPU context (fxsave).
3177; @param A1 Pointer to a IEMFPURESULT for the output.
3178; @param A2 Pointer to the 64-bit floating point value to convert.
3179;
3180BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3181 PROLOGUE_3_ARGS
3182 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3183
3184 fninit
3185 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3186 fld qword [A2]
3187
3188 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
3189 fnclex
3190 fstp tword [A1 + IEMFPURESULT.r80Result]
3191
3192 fninit ; leave the FPU in its initial state
3193 add xSP, 20h
3194 EPILOGUE_3_ARGS
3195ENDPROC iemAImpl_fld_r80_from_r64
3196
3197
3198;;
3199; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3200;
3201; @param A0 FPU context (fxsave).
3202; @param A1 Where to return the output FSW.
3203; @param A2 Where to store the 64-bit value.
3204; @param A3 Pointer to the 80-bit value.
3205;
3206BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3207 PROLOGUE_4_ARGS
3208 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3209
3210 fninit
3211 fld tword [A3] ; value is loaded before the guest FCW/FSW bits are applied
3212 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3213 fst qword [A2] ; non-popping store; the trailing fninit resets the FPU anyway
3214
3215 fnstsw word [A1] ; return the resulting FSW
3216
3217 fninit ; leave the FPU in its initial state
3218 add xSP, 20h
3219 EPILOGUE_4_ARGS
3220ENDPROC iemAImpl_fst_r80_to_r64
3221
3222
3223;;
3224; FPU instruction working on one 80-bit and one 64-bit floating point value.
3225;
3226; @param 1 The instruction
3227;
3228; @param A0 FPU context (fxsave).
3229; @param A1 Pointer to a IEMFPURESULT for the output.
3230; @param A2 Pointer to the 80-bit value.
3231; @param A3 Pointer to the 64-bit value.
3232;
3233%macro IEMIMPL_FPU_R80_BY_R64 1
3234BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3235 PROLOGUE_4_ARGS
3236 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3237
3238 fninit
3239 fld tword [A2] ; the 80-bit operand becomes ST0
3240 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3241 %1 qword [A3]
3242
3243 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
3244 fnclex
3245 fstp tword [A1 + IEMFPURESULT.r80Result]
3246
3247 fninit ; leave the FPU in its initial state
3248 add xSP, 20h
3249 EPILOGUE_4_ARGS
3250ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3251%endmacro
3252
3253IEMIMPL_FPU_R80_BY_R64 fadd
3254IEMIMPL_FPU_R80_BY_R64 fmul
3255IEMIMPL_FPU_R80_BY_R64 fsub
3256IEMIMPL_FPU_R80_BY_R64 fsubr
3257IEMIMPL_FPU_R80_BY_R64 fdiv
3258IEMIMPL_FPU_R80_BY_R64 fdivr
3259
3260;;
3261; FPU instruction working on one 80-bit and one 64-bit floating point value,
3262; only returning FSW.
3263;
3264; @param 1 The instruction
3265;
3266; @param A0 FPU context (fxsave).
3267; @param A1 Where to store the output FSW.
3268; @param A2 Pointer to the 80-bit value.
3269; @param A3 Pointer to the 64-bit value.
3270;
3271%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3272BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3273 PROLOGUE_4_ARGS
3274 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3275
3276 fninit
3277 fld tword [A2] ; the 80-bit operand becomes ST0
3278 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3279 %1 qword [A3]
3280
3281 fnstsw word [A1] ; only the status word is returned
3282
3283 fninit ; leave the FPU in its initial state
3284 add xSP, 20h
3285 EPILOGUE_4_ARGS
3286ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3287%endmacro
3288
3289IEMIMPL_FPU_R80_BY_R64_FSW fcom
3290
3291
3292
3293;
3294;---------------------- 80-bit floating point operations ----------------------
3295;
3296
3297;;
3298; Loads a 80-bit floating point register value from memory.
3299;
3300; @param A0 FPU context (fxsave).
3301; @param A1 Pointer to a IEMFPURESULT for the output.
3302; @param A2 Pointer to the 80-bit floating point value to load.
3303;
3304BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3305 PROLOGUE_3_ARGS
3306 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3307
3308 fninit
3309 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3310 fld tword [A2]
3311
3312 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
3313 fnclex
3314 fstp tword [A1 + IEMFPURESULT.r80Result]
3315
3316 fninit ; leave the FPU in its initial state
3317 add xSP, 20h
3318 EPILOGUE_3_ARGS
3319ENDPROC iemAImpl_fld_r80_from_r80
3320
3321
3322;;
3323; Store a 80-bit floating point register to memory
3324;
3325; @param A0 FPU context (fxsave).
3326; @param A1 Where to return the output FSW.
3327; @param A2 Where to store the 80-bit value.
3328; @param A3 Pointer to the 80-bit register value.
3329;
3330BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3331 PROLOGUE_4_ARGS
3332 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3333
3334 fninit
3335 fld tword [A3] ; value is loaded before the guest FCW/FSW bits are applied
3336 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3337 fstp tword [A2]
3338
3339 fnstsw word [A1] ; return the resulting FSW
3340
3341 fninit ; leave the FPU in its initial state
3342 add xSP, 20h
3343 EPILOGUE_4_ARGS
3344ENDPROC iemAImpl_fst_r80_to_r80
3345
3346
3347;;
3348; Loads an 80-bit floating point register value in BCD format from memory.
3349;
3350; @param A0 FPU context (fxsave).
3351; @param A1 Pointer to a IEMFPURESULT for the output.
3352; @param A2 Pointer to the 80-bit BCD value to load.
3353;
3354BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3355 PROLOGUE_3_ARGS
3356 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3357
3358 fninit
3359 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3360 fbld tword [A2]
3361
3362 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
3363 fnclex
3364 fstp tword [A1 + IEMFPURESULT.r80Result]
3365
3366 fninit ; leave the FPU in its initial state
3367 add xSP, 20h
3368 EPILOGUE_3_ARGS
3369ENDPROC iemAImpl_fld_r80_from_d80
3370
3371
3372;;
3373; Store a 80-bit floating point register to memory as BCD
3374;
3375; @param A0 FPU context (fxsave).
3376; @param A1 Where to return the output FSW.
3377; @param A2 Where to store the 80-bit BCD value.
3378; @param A3 Pointer to the 80-bit register value.
3379;
3380BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3381 PROLOGUE_4_ARGS
3382 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3383
3384 fninit
3385 fld tword [A3] ; value is loaded before the guest FCW/FSW bits are applied
3386 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3387 fbstp tword [A2]
3388
3389 fnstsw word [A1] ; return the resulting FSW
3390
3391 fninit ; leave the FPU in its initial state
3392 add xSP, 20h
3393 EPILOGUE_4_ARGS
3394ENDPROC iemAImpl_fst_r80_to_d80
3395
3396
3397;;
3398; FPU instruction working on two 80-bit floating point values.
3399;
3400; @param 1 The instruction
; @param 2 The operand list to put after the instruction (may be empty).
3401;
3402; @param A0 FPU context (fxsave).
3403; @param A1 Pointer to a IEMFPURESULT for the output.
3404; @param A2 Pointer to the first 80-bit value (ST0)
3405; @param A3 Pointer to the second 80-bit value (STn).
3406;
3407%macro IEMIMPL_FPU_R80_BY_R80 2
3408BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3409 PROLOGUE_4_ARGS
3410 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3411
3412 fninit
3413 fld tword [A3] ; pushed first, so it ends up in ST1
3414 fld tword [A2] ; pushed last, so it ends up in ST0
3415 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3416 %1 %2
3417
3418 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
3419 fnclex
3420 fstp tword [A1 + IEMFPURESULT.r80Result] ; the result is taken from ST0
3421
3422 fninit ; leave the FPU in its initial state
3423 add xSP, 20h
3424 EPILOGUE_4_ARGS
3425ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3426%endmacro
3427
3428IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3429IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3430IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3431IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3432IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3433IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3434IEMIMPL_FPU_R80_BY_R80 fprem, {}
3435IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3436IEMIMPL_FPU_R80_BY_R80 fscale, {}
3437
3438
3439;;
3440; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3441; storing the result in ST1 and popping the stack.
3442;
3443; @param 1 The instruction
3444;
3445; @param A0 FPU context (fxsave).
3446; @param A1 Pointer to a IEMFPURESULT for the output.
3447; @param A2 Pointer to the first 80-bit value (ST1).
3448; @param A3 Pointer to the second 80-bit value (ST0).
3449;
3450%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3451BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3452 PROLOGUE_4_ARGS
3453 sub xSP, 20h ; scratch for the fnstenv/fldenv image used by the macro below
3454
3455 fninit
3456 fld tword [A2] ; pushed first, so it ends up in ST1
3457 fld tword [A3] ; pushed last, so it ends up in ST0
3458 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3459 %1
3460
3461 fnstsw word [A1 + IEMFPURESULT.FSW] ; capture FSW before fnclex clears the exception flags
3462 fnclex
3463 fstp tword [A1 + IEMFPURESULT.r80Result] ; after the instruction popped, the result sits in ST0
3464
3465 fninit ; leave the FPU in its initial state
3466 add xSP, 20h
3467 EPILOGUE_4_ARGS
3468ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3469%endmacro
3470
3471IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3472IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3473IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3474
3475
;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]                      ; second value, ends up in ST1
        fld     tword [A2]                      ; first value, ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word  [A1]                      ; the status word is the only output

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro
3505
3506IEMIMPL_FPU_R80_BY_R80_FSW fcom
3507IEMIMPL_FPU_R80_BY_R80_FSW fucom
3508
3509
;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param    1       The instruction
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]                      ; second value, ends up in ST1
        fld     tword [A2]                      ; first value, ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1                             ; compares ST0 against ST1

        fnstsw  word  [A1]
        pushf                                   ; grab the flags right after the compare ...
        pop     xAX                             ; ... and return them in xAX

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro
3542
3543IEMIMPL_FPU_R80_BY_R80_EFL fcomi
3544IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3545
3546
;;
; FPU instruction working on one 80-bit floating point value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]                      ; the single operand, becomes ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                                      ; implicit operand

        fnstsw  word  [A1 + IEMFPURESULT.FSW]   ; capture the status word before it is cleared below
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro
3575
3576IEMIMPL_FPU_R80 fchs
3577IEMIMPL_FPU_R80 fabs
3578IEMIMPL_FPU_R80 f2xm1
3579IEMIMPL_FPU_R80 fsqrt
3580IEMIMPL_FPU_R80 frndint
3581IEMIMPL_FPU_R80 fsin
3582IEMIMPL_FPU_R80 fcos
3583
3584
;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW.
;
; @param    1       The instruction
; @param    2       Non-zero to also restore FTW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]                      ; the single operand, becomes ST0
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0 ; variant selected by macro parameter 2
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word  [A1]                      ; the status word is the only output

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro
3617
3618IEMIMPL_FPU_R80_FSW ftst, 0
3619IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3620
3621
3622
;;
; FPU instruction loading a 80-bit floating point constant.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                                      ; pushes the constant, which becomes ST0

        fnstsw  word  [A1 + IEMFPURESULT.FSW]   ; capture the status word before it is cleared below
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1                         ; same symbol as on the BEGINPROC_FASTCALL line (no dangling %+)
%endmacro
3649
3650IEMIMPL_FPU_R80_CONST fld1
3651IEMIMPL_FPU_R80_CONST fldl2t
3652IEMIMPL_FPU_R80_CONST fldl2e
3653IEMIMPL_FPU_R80_CONST fldpi
3654IEMIMPL_FPU_R80_CONST fldlg2
3655IEMIMPL_FPU_R80_CONST fldln2
3656IEMIMPL_FPU_R80_CONST fldz
3657
3658
;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULTTWO for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]                      ; the single input operand, becomes ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW] ; capture the status word before it is cleared below
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; top of stack is stored as the second result
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; next stack entry is stored as the first result

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro
3689
3690IEMIMPL_FPU_R80_R80 fptan
3691IEMIMPL_FPU_R80_R80 fxtract
3692IEMIMPL_FPU_R80_R80 fsincos
3693
3694
3695
3696
3697;---------------------- SSE and MMX Operations ----------------------
3698
;; @todo what do we need to do for MMX?
; Hook expanded at the start of the MMX workers; currently expands to nothing.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
; Hook expanded at the end of the MMX workers; currently expands to nothing.
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
; Hook expanded at the start of the SSE workers; currently expands to nothing.
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
; Hook expanded at the end of the SSE workers; currently expands to nothing.
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
; Hook expanded at the start of the AVX workers; currently expands to nothing.
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
; Hook expanded at the end of the AVX workers; currently expands to nothing.
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3716
3717
;;
; Two-operand media instruction on full sized registers: generates the
; iemAImpl_<insn>_u128 worker and, on request, the iemAImpl_<insn>_u64 one.
;
; @param    1       The instruction
; @param    2       Whether there is an MMX variant (1) or not (0).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first media register size operand (input/output).
; @param    A2      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]                       ; second operand (input only)
        movq    mm0, [A1]                       ; first operand (input and output)
        %1      mm0, mm1
        movq    [A1], mm0                       ; result goes back over the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]                      ; second operand (input only)
        movdqu  xmm0, [A1]                      ; first operand (input and output)
        %1      xmm0, xmm1
        movdqu  [A1], xmm0                      ; result goes back over the first operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3757
3758IEMIMPL_MEDIA_F2 pshufb, 1
3759IEMIMPL_MEDIA_F2 pand, 1
3760IEMIMPL_MEDIA_F2 pandn, 1
3761IEMIMPL_MEDIA_F2 por, 1
3762IEMIMPL_MEDIA_F2 pxor, 1
3763IEMIMPL_MEDIA_F2 pcmpeqb, 1
3764IEMIMPL_MEDIA_F2 pcmpeqw, 1
3765IEMIMPL_MEDIA_F2 pcmpeqd, 1
3766IEMIMPL_MEDIA_F2 pcmpeqq, 0
3767IEMIMPL_MEDIA_F2 pcmpgtb, 1
3768IEMIMPL_MEDIA_F2 pcmpgtw, 1
3769IEMIMPL_MEDIA_F2 pcmpgtd, 1
3770IEMIMPL_MEDIA_F2 pcmpgtq, 0
3771IEMIMPL_MEDIA_F2 paddb, 1
3772IEMIMPL_MEDIA_F2 paddw, 1
3773IEMIMPL_MEDIA_F2 paddd, 1
3774IEMIMPL_MEDIA_F2 paddq, 1
3775IEMIMPL_MEDIA_F2 paddsb, 1
3776IEMIMPL_MEDIA_F2 paddsw, 1
3777IEMIMPL_MEDIA_F2 paddusb, 1
3778IEMIMPL_MEDIA_F2 paddusw, 1
3779IEMIMPL_MEDIA_F2 psubb, 1
3780IEMIMPL_MEDIA_F2 psubw, 1
3781IEMIMPL_MEDIA_F2 psubd, 1
3782IEMIMPL_MEDIA_F2 psubq, 1
3783IEMIMPL_MEDIA_F2 psubsb, 1
3784IEMIMPL_MEDIA_F2 psubsw, 1
3785IEMIMPL_MEDIA_F2 psubusb, 1
3786IEMIMPL_MEDIA_F2 psubusw, 1
3787IEMIMPL_MEDIA_F2 pmullw, 1
3788IEMIMPL_MEDIA_F2 pmulld, 0
3789IEMIMPL_MEDIA_F2 pmulhw, 1
3790IEMIMPL_MEDIA_F2 pmaddwd, 1
3791IEMIMPL_MEDIA_F2 pminub, 1
3792IEMIMPL_MEDIA_F2 pminuw, 0
3793IEMIMPL_MEDIA_F2 pminud, 0
3794IEMIMPL_MEDIA_F2 pminsb, 0
3795IEMIMPL_MEDIA_F2 pminsw, 1
3796IEMIMPL_MEDIA_F2 pminsd, 0
3797IEMIMPL_MEDIA_F2 pmaxub, 1
3798IEMIMPL_MEDIA_F2 pmaxuw, 0
3799IEMIMPL_MEDIA_F2 pmaxud, 0
3800IEMIMPL_MEDIA_F2 pmaxsb, 0
3801IEMIMPL_MEDIA_F2 pmaxsw, 1
3802IEMIMPL_MEDIA_F2 pmaxsd, 0
3803IEMIMPL_MEDIA_F2 pabsb, 1
3804IEMIMPL_MEDIA_F2 pabsw, 1
3805IEMIMPL_MEDIA_F2 pabsd, 1
3806IEMIMPL_MEDIA_F2 psignb, 1
3807IEMIMPL_MEDIA_F2 psignw, 1
3808IEMIMPL_MEDIA_F2 psignd, 1
3809IEMIMPL_MEDIA_F2 phaddw, 1
3810IEMIMPL_MEDIA_F2 phaddd, 1
3811IEMIMPL_MEDIA_F2 phsubw, 1
3812IEMIMPL_MEDIA_F2 phsubd, 1
3813IEMIMPL_MEDIA_F2 phaddsw, 1
3814IEMIMPL_MEDIA_F2 phsubsw, 1
3815IEMIMPL_MEDIA_F2 pmaddubsw, 1
3816IEMIMPL_MEDIA_F2 pmulhrsw, 1
3817IEMIMPL_MEDIA_F2 pmuludq, 1
3818
3819
;;
; Two-operand media instruction on full sized registers, for workers that
; do not receive the FXSAVE state pointer.
;
; @param    1       The instruction
; @param    2       Whether there is an MMX variant (1) or not (0).
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]                       ; second operand (input only)
        movq    mm0, [A0]                       ; first operand (input and output)
        %1      mm0, mm1
        movq    [A0], mm0                       ; write the result back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]                      ; second operand (input only)
        movdqu  xmm0, [A0]                      ; first operand (input and output)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0                      ; write the result back in place

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3858
3859IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3860IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3861IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3862IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3863IEMIMPL_MEDIA_OPT_F2 psllw, 1
3864IEMIMPL_MEDIA_OPT_F2 pslld, 1
3865IEMIMPL_MEDIA_OPT_F2 psllq, 1
3866IEMIMPL_MEDIA_OPT_F2 psrlw, 1
3867IEMIMPL_MEDIA_OPT_F2 psrld, 1
3868IEMIMPL_MEDIA_OPT_F2 psrlq, 1
3869IEMIMPL_MEDIA_OPT_F2 psraw, 1
3870IEMIMPL_MEDIA_OPT_F2 psrad, 1
3871IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
3872IEMIMPL_MEDIA_OPT_F2 pavgb, 1
3873IEMIMPL_MEDIA_OPT_F2 pavgw, 1
3874IEMIMPL_MEDIA_OPT_F2 psadbw, 1
3875IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
3876IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
3877IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
3878IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
3879IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3880IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3881IEMIMPL_MEDIA_OPT_F2 aesimc, 0
3882IEMIMPL_MEDIA_OPT_F2 aesenc, 0
3883IEMIMPL_MEDIA_OPT_F2 aesdec, 0
3884IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
3885IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3886IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
3887IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
3888IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
3889IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
3890IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3891
;;
; Media instruction taking one full sized operand and the lower half of a
; second one. Both operands are loaded in full.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]                       ; second operand (input only)
        movq    mm0, [A0]                       ; first operand (input and output)
        %1      mm0, mm1
        movq    [A0], mm0                       ; write the result back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]                      ; second operand (input only)
        movdqu  xmm0, [A0]                      ; first operand (input and output)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0                      ; write the result back in place

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3930
3931IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3932IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3933IEMIMPL_MEDIA_F1L1 punpckldq, 1
3934IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3935
3936
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
3976
3977IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
3978IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
3979IEMIMPL_MEDIA_F1L1L1 vpunpckldq
3980IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3981
3982
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; The generated code is the same as for the low half variant since both
; operands are loaded in full, so this simply forwards to IEMIMPL_MEDIA_F1L1.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second full sized media register operand, where we
;                   will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high half macro so these follow it should it ever
; diverge from IEMIMPL_MEDIA_F1L1; the expansion is unchanged.
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
4001
4002
;;
; Media instruction working two half sized input registers (high half) and a full sized
; destination register (vpunpckh*).
;
; Forwards to IEMIMPL_MEDIA_F1L1L1, which loads both sources in full.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the upper half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the upper half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro
4018
4019IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
4020IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
4021IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
4022IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4023
4024
4025;
4026; Shufflers with evil 8-bit immediates.
4027;
4028
;;
; PSHUFW with the immediate supplied at runtime.
;
; The immediate selects one of 256 generated 'pshufw mm0, mm1, imm8 + ret'
; stubs, which is called indirectly.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the 64-bit source (input).
; @param    A2      The 8-bit immediate; only the low byte is used.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        and     A2, 0ffh                        ; 8-bit argument: the upper register bits are not guaranteed to
                                                ; be zero and must not leak into the stub table index.
        movq    mm1, [A1]
        movq    mm0, mm1                        ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]                 ; sizeof(pshufw+ret) == 9
 %else
        lea     T0, [A2 + A2*4]                 ; sizeof(pshufw+ret) == 5
 %endif
        lea     T1, [T1 + T0]                   ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_pshufw_u64
4058
4059
;;
; SSE shuffle instruction with the immediate supplied at runtime.
;
; The immediate selects one of 256 generated '<insn> xmm0, xmm1, imm8 + ret'
; stubs, which is called indirectly.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the 128-bit destination (output).
; @param    A1      Pointer to the 128-bit source (input).
; @param    A2      The 8-bit immediate; only the low byte is used.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A2, 0ffh                        ; 8-bit argument: the upper register bits are not guaranteed to
                                                ; be zero and must not leak into the stub table index.
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1                      ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]                 ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]                 ; sizeof(pshufXX+ret) ==  6: A2 *  6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]                 ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
4092
4093IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
4094IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
4095IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4096
4097
;;
; AVX shuffle instruction (256-bit) with the immediate supplied at runtime.
;
; The immediate selects one of 256 generated '<insn> ymm0, ymm1, imm8 + ret'
; stubs, which is called indirectly.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the 256-bit destination (output).
; @param    A1      Pointer to the 256-bit source (input).
; @param    A2      The 8-bit immediate; only the low byte is used.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; AVX worker, so use the AVX hooks

        and     A2, 0ffh                        ; 8-bit argument: the upper register bits are not guaranteed to
                                                ; be zero and must not leak into the stub table index.
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1                      ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]                 ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]                 ; sizeof(pshufXX+ret) ==  6: A2 *  6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]                 ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4129
4130IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
4131IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
4132IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4133
4134
4135;
4136; Shifts with evil 8-bit immediates.
4137;
4138
;;
; MMX shift instruction with the shift count immediate supplied at runtime.
;
; The immediate selects one of 256 generated '<insn> mm0, imm8 + ret' stubs,
; which is called indirectly.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the 64-bit operand (input/output).
; @param    A1      The 8-bit immediate; only the low byte is used.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        and     A1, 0ffh                        ; 8-bit argument: the upper register bits are not guaranteed to
                                                ; be zero and must not leak into the stub table index.
        movq    mm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*8]                 ; sizeof(psXX+ret) == 9
 %else
        lea     T0, [A1 + A1*4]                 ; sizeof(psXX+ret) == 5
 %endif
        lea     T1, [T1 + T0]                   ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro
4169
4170IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4171IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4172IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4173IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4174IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4175IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4176IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4177IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4178
4179
;;
; SSE shift instruction with the shift count immediate supplied at runtime.
;
; The immediate selects one of 256 generated '<insn> xmm0, imm8 + ret' stubs,
; which is called indirectly.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the 128-bit operand (input/output).
; @param    A1      The 8-bit immediate; only the low byte is used.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A1, 0ffh                        ; 8-bit argument: the upper register bits are not guaranteed to
                                                ; be zero and must not leak into the stub table index.
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*4]                 ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
 %else
        lea     T0, [A1 + A1*2]                 ; sizeof(psXX+ret) ==  6: A1 *  6 = (A1 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]                 ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro
4210
4211IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
4212IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
4213IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
4214IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
4215IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
4216IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
4217IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
4218IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
4219IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
4220IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4221
4222
4223;
4224; Move byte mask.
4225;
4226
; PMOVMSKB, MMX register source.
;   A0 - where to store the resulting byte mask (written as 64 bits).
;   A1 - pointer to the 64-bit source value.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        pmovmskb T0, mm1
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0               ; T0 only covers the low dword on 32-bit hosts, clear the rest
%endif
        mov     [A0], T0
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4240
; PMOVMSKB, XMM register source.
;   A0 - where to store the resulting byte mask (written as 64 bits).
;   A1 - pointer to the 128-bit source value.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0               ; T0 only covers the low dword on 32-bit hosts, clear the rest
%endif
        mov     [A0], T0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4254
; VPMOVMSKB, YMM register source.
;   A0 - where to store the resulting byte mask (written as 64 bits).
;   A1 - pointer to the 256-bit source value.
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0               ; T0 only covers the low dword on 32-bit hosts, clear the rest
%endif
        mov     [A0], T0
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4268
4269
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; Generates both the 128-bit (_u128) and the 256-bit (_u256) worker.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the extended CPU/FPU state (X86XSAVEAREA).
; @param    A1      Pointer to the destination media register size operand (output).
; @param    A2      Pointer to the first source media register size operand (input).
; @param    A3      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4307
4308IEMIMPL_MEDIA_F3 vpshufb
4309IEMIMPL_MEDIA_F3 vpand
4310IEMIMPL_MEDIA_F3 vpminub
4311IEMIMPL_MEDIA_F3 vpminuw
4312IEMIMPL_MEDIA_F3 vpminud
4313IEMIMPL_MEDIA_F3 vpminsb
4314IEMIMPL_MEDIA_F3 vpminsw
4315IEMIMPL_MEDIA_F3 vpminsd
4316IEMIMPL_MEDIA_F3 vpmaxub
4317IEMIMPL_MEDIA_F3 vpmaxuw
4318IEMIMPL_MEDIA_F3 vpmaxud
4319IEMIMPL_MEDIA_F3 vpmaxsb
4320IEMIMPL_MEDIA_F3 vpmaxsw
4321IEMIMPL_MEDIA_F3 vpmaxsd
4322IEMIMPL_MEDIA_F3 vpandn
4323IEMIMPL_MEDIA_F3 vpor
4324IEMIMPL_MEDIA_F3 vpxor
4325IEMIMPL_MEDIA_F3 vpcmpeqb
4326IEMIMPL_MEDIA_F3 vpcmpeqw
4327IEMIMPL_MEDIA_F3 vpcmpeqd
4328IEMIMPL_MEDIA_F3 vpcmpeqq
4329IEMIMPL_MEDIA_F3 vpcmpgtb
4330IEMIMPL_MEDIA_F3 vpcmpgtw
4331IEMIMPL_MEDIA_F3 vpcmpgtd
4332IEMIMPL_MEDIA_F3 vpcmpgtq
4333IEMIMPL_MEDIA_F3 vpaddb
4334IEMIMPL_MEDIA_F3 vpaddw
4335IEMIMPL_MEDIA_F3 vpaddd
4336IEMIMPL_MEDIA_F3 vpaddq
4337IEMIMPL_MEDIA_F3 vpsubb
4338IEMIMPL_MEDIA_F3 vpsubw
4339IEMIMPL_MEDIA_F3 vpsubd
4340IEMIMPL_MEDIA_F3 vpsubq
4341
4342
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; Generates both the 128-bit (_u128) and the 256-bit (_u256) worker.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4380
4381IEMIMPL_MEDIA_OPT_F3 vpacksswb
4382IEMIMPL_MEDIA_OPT_F3 vpackssdw
4383IEMIMPL_MEDIA_OPT_F3 vpackuswb
4384IEMIMPL_MEDIA_OPT_F3 vpackusdw
4385IEMIMPL_MEDIA_OPT_F3 vpmullw
4386IEMIMPL_MEDIA_OPT_F3 vpmulld
4387IEMIMPL_MEDIA_OPT_F3 vpmulhw
4388IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4389IEMIMPL_MEDIA_OPT_F3 vpavgb
4390IEMIMPL_MEDIA_OPT_F3 vpavgw
4391IEMIMPL_MEDIA_OPT_F3 vpsignb
4392IEMIMPL_MEDIA_OPT_F3 vpsignw
4393IEMIMPL_MEDIA_OPT_F3 vpsignd
4394IEMIMPL_MEDIA_OPT_F3 vphaddw
4395IEMIMPL_MEDIA_OPT_F3 vphaddd
4396IEMIMPL_MEDIA_OPT_F3 vphsubw
4397IEMIMPL_MEDIA_OPT_F3 vphsubd
4398IEMIMPL_MEDIA_OPT_F3 vphaddsw
4399IEMIMPL_MEDIA_OPT_F3 vphsubsw
4400IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4401IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4402IEMIMPL_MEDIA_OPT_F3 vpsadbw
4403IEMIMPL_MEDIA_OPT_F3 vpmuldq
4404IEMIMPL_MEDIA_OPT_F3 vpmuludq
4405IEMIMPL_MEDIA_OPT_F3 vunpcklps
4406IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4407IEMIMPL_MEDIA_OPT_F3 vunpckhps
4408IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4409IEMIMPL_MEDIA_OPT_F3 vpsubsb
4410IEMIMPL_MEDIA_OPT_F3 vpsubsw
4411IEMIMPL_MEDIA_OPT_F3 vpsubusb
4412IEMIMPL_MEDIA_OPT_F3 vpsubusw
4413IEMIMPL_MEDIA_OPT_F3 vpaddusb
4414IEMIMPL_MEDIA_OPT_F3 vpaddusw
4415IEMIMPL_MEDIA_OPT_F3 vpaddsb
4416IEMIMPL_MEDIA_OPT_F3 vpaddsw
4417
4418
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1       The instruction
; @param    2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro
4456
4457IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4458IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4459IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4460IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4461
4462
;
; The SSE 4.2 crc32
;
; @param    A0      Pointer to the 32-bit destination.
; @param    A1      The source operand, sized according to the suffix.
;
; CRC32 with a byte sized source: A0 points to the 32-bit accumulator, which
; is updated in place; the low 8 bits of A1 are the data.
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]               ; current accumulator value
        crc32   T0_32, A1_8
        mov     dword [A0], T0_32               ; updated accumulator value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4478
; CRC32 with a word sized source: A0 points to the 32-bit accumulator, which
; is updated in place; the low 16 bits of A1 are the data.
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]               ; current accumulator value
        crc32   T0_32, A1_16
        mov     dword [A0], T0_32               ; updated accumulator value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4488
; CRC32 with a dword sized source: A0 points to the 32-bit accumulator, which
; is updated in place; the low 32 bits of A1 are the data.
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]               ; current accumulator value
        crc32   T0_32, A1_32
        mov     dword [A0], T0_32               ; updated accumulator value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4498
%ifdef RT_ARCH_AMD64
; CRC32 with a qword sized source (64-bit hosts only): A0 points to the 32-bit
; accumulator, which is updated in place; A1 is the data.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]               ; current accumulator value (32-bit load)
        crc32   T0, A1                          ; 64-bit form of the instruction
        mov     dword [A0], T0_32               ; only the low 32 bits are stored back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4510
4511
4512;
4513; PTEST (SSE 4.1)
4514;
4515; @param A0 Pointer to the first source operand (aka readonly destination).
4516; @param A1 Pointer to the second source operand.
4517; @param A2 Pointer to the EFLAGS register.
4518;
; PTEST on two 128-bit values: A0 and A1 point to the operands (both only
; read), A2 points to the EFLAGS value handed to IEM_SAVE_FLAGS.
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]                      ; second source operand
        movdqu  xmm0, [A0]                      ; first source operand
        ptest   xmm0, xmm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4531
; VPTEST on two 256-bit values: A0 and A1 point to the operands (both only
; read), A2 points to the EFLAGS value handed to IEM_SAVE_FLAGS.
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; AVX worker, so use the AVX hooks

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4544
4545
;;
; Template for the [v]pmov{s,z}x* instructions
;
; Generates the SSE4.1 worker (iemAImpl_<insn>_u128) and the two AVX workers
; (iemAImpl_v<insn>_u128 and iemAImpl_v<insn>_u256).
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      The source operand value (input). For the _u256 worker this
;                   is instead a pointer to the 128-bit source, see the load there.
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        movdqu  [A0], xmm0                      ; legacy SSE store, the SSE variant must not need AVX on the host

        IEMIMPL_SSE_EPILOGUE                    ; pairs with the IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]                      ; A1 is dereferenced here: 128 bits of source data
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro
4591
4592IEMIMPL_V_PMOV_SZ_X pmovsxbw
4593IEMIMPL_V_PMOV_SZ_X pmovsxbd
4594IEMIMPL_V_PMOV_SZ_X pmovsxbq
4595IEMIMPL_V_PMOV_SZ_X pmovsxwd
4596IEMIMPL_V_PMOV_SZ_X pmovsxwq
4597IEMIMPL_V_PMOV_SZ_X pmovsxdq
4598
4599IEMIMPL_V_PMOV_SZ_X pmovzxbw
4600IEMIMPL_V_PMOV_SZ_X pmovzxbd
4601IEMIMPL_V_PMOV_SZ_X pmovzxbq
4602IEMIMPL_V_PMOV_SZ_X pmovzxwd
4603IEMIMPL_V_PMOV_SZ_X pmovzxwq
4604IEMIMPL_V_PMOV_SZ_X pmovzxdq
4605
4606
;;
; Need to move this as well somewhere better?
;
; Result layout used by the SSE workers: a 128-bit value followed by MXCSR.
;
struc IEMSSERESULT
    .uResult      resd 4        ; 4 dwords = 128 bits of result data
    .MXCSR        resd 1        ; MXCSR value returned alongside the result
endstruc
4614
4615
;;
; Need to move this as well somewhere better?
;
; Result layout used by the 128-bit AVX workers: a 128-bit value followed by MXCSR.
;
struc IEMAVX128RESULT
    .uResult      resd 4        ; 4 dwords = 128 bits of result data
    .MXCSR        resd 1        ; MXCSR value returned alongside the result
endstruc
4623
4624
;;
; Need to move this as well somewhere better?
;
; Result layout used by the 256-bit AVX workers: a 256-bit value followed by MXCSR.
;
struc IEMAVX256RESULT
    .uResult      resd 8        ; 8 dwords = 256 bits of result data
    .MXCSR        resd 1        ; MXCSR value returned alongside the result
endstruc
4632
4633
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
; @note     xSP is left 4 bytes lower on exit; that slot holds the MXCSR value
;           that was active on entry.
;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub     xSP, 4                          ; slot for the MXCSR value active on entry (stays allocated)

        stmxcsr [xSP]
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; keep only these guest bits
        or      T0_32, X86_MXCSR_XCPT_MASK      ; NOTE(review): going by the name this masks all SIMD FP exceptions - confirm
        sub     xSP, 4                          ; temporary slot, ldmxcsr takes a memory operand
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4                          ; drop the temporary slot only
%endmacro
4653
4654
4655;;
4656; Restores the SSE MXCSR register with the original value.
4657;
4658; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4659; @param 1 Expression giving the address where to return the MXCSR value.
4660; @param 2 Expression giving the address of the FXSTATE of the guest.
4661;
4662; @note Restores the stack pointer.
4663;
4664%macro SSE_ST_FXSTATE_MXCSR 2
; Expects [xSP] to hold the MXCSR value saved by SSE_LD_FXSTATE_MXCSR.
4665 sub xSP, 4
4666 stmxcsr [xSP]
4667 mov T0_32, [xSP] ; T0 = MXCSR as it is after the executed instruction
4668 add xSP, 4
4669 ; Merge the status bits into the original MXCSR value.
4670 mov T1_32, [%2 + X86FXSTATE.MXCSR]
4671 and T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the exception flag bits
4672 or T0_32, T1_32
4673 mov [%1 + IEMSSERESULT.MXCSR], T0_32
4674
4675 ldmxcsr [xSP] ; reload the MXCSR value stored at [xSP]
4676 add xSP, 4 ; release that slot
4677%endmacro
4678
4679
4680;;
4681; Initialize the SSE MXCSR register using the guest value partially to
4682; account for rounding mode.
4683;
4684; @uses 4 bytes of stack to save the original value.
4685; @param 1 Expression giving the address of the FXSTATE of the guest.
4686;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        ; Save the current MXCSR in a dword that stays on the stack when the macro ends
        ; (xSP is 4 lower on exit).  AVX128_ST_XSAVEAREA_MXCSR / AVX256_ST_XSAVEAREA_MXCSR
        ; reload it and release the slot.  Clobbers T0.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Take only the FZ, rounding control and DAZ bits from the guest value and mask all
        ; exceptions, exactly like SSE_LD_FXSTATE_MXCSR does.  Without the mask bits every
        ; SIMD floating-point exception would be unmasked on the host while the emulated
        ; instruction executes.
        ; NOTE(review): the AVX*_ST_XSAVEAREA_MXCSR macros store MXCSR as-is, so the value
        ; they return now has the exception mask bits set - confirm the callers only
        ; consume the exception flag bits.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; ldmxcsr only takes a memory operand, so bounce the value through a temporary dword.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4698
4699
4700;;
4701; Restores the AVX128 MXCSR register with the original value.
4702;
4703; @param 1 Expression giving the address where to return the MXCSR value.
4704;
4705; @note Restores the stack pointer.
4706;
4707%macro AVX128_ST_XSAVEAREA_MXCSR 1
; Expects [xSP] to hold the MXCSR value saved by AVX_LD_XSAVEAREA_MXCSR.
; The current MXCSR is stored unmodified; unlike SSE_ST_FXSTATE_MXCSR nothing is
; merged with the guest value.
4708 stmxcsr [%1 + IEMAVX128RESULT.MXCSR]
4709
4710 ldmxcsr [xSP] ; reload the MXCSR value stored at [xSP]
4711 add xSP, 4 ; release that slot
4712%endmacro
4713
4714
4715;;
4716; Restores the AVX256 MXCSR register with the original value.
4717;
4718; @param 1 Expression giving the address where to return the MXCSR value.
4719;
4720; @note Restores the stack pointer.
4721;
4722%macro AVX256_ST_XSAVEAREA_MXCSR 1
; Expects [xSP] to hold the MXCSR value saved by AVX_LD_XSAVEAREA_MXCSR.
; The current MXCSR is stored unmodified; unlike SSE_ST_FXSTATE_MXCSR nothing is
; merged with the guest value.
4723 stmxcsr [%1 + IEMAVX256RESULT.MXCSR]
4724
4725 ldmxcsr [xSP] ; reload the MXCSR value stored at [xSP]
4726 add xSP, 4 ; release that slot
4727%endmacro
4728
4729
4730;;
4731; Floating point instruction working on two full sized registers.
4732;
4733; @param 1 The instruction
4734; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4735;
4736; @param A0 FPU context (FXSTATE or XSAVEAREA).
4737; @param A1 Where to return the result including the MXCSR value.
4738; @param A2 Pointer to the first media register size operand (input/output).
4739; @param A3 Pointer to the second media register size operand (input).
4740;
%macro IEMIMPL_FP_F2 2
;
; Every procedure below follows the same shape: load a host MXCSR derived from the
; guest value (A0), load both operands (A2, A3), execute the instruction, store the
; value into the result structure at A1, then store the resulting MXCSR and restore
; the previous host MXCSR.
;
; NOTE(review): the byte count given to BEGINPROC_FASTCALL is 12 although four
; arguments are used (sibling macros pass 16); left untouched as it may be part of
; the decorated symbol name on some targets - confirm.
;

; SSE variant, 128-bit.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu   xmm0, [A2]
        movdqu   xmm1, [A3]
        %1       xmm0, xmm1
        movdqu   [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
; AVX variants where the instruction takes three operands (dst, src1, src2).
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm0, [A2]
        vmovdqu  xmm1, [A3]
        v %+ %1  xmm0, xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  ymm0, [A2]
        vmovdqu  ymm1, [A3]
        v %+ %1  ymm0, ymm0, ymm1
        vmovdqu  [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
; AVX variants where the instruction takes two operands (dst, src).
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm0, [A2]
        vmovdqu  xmm1, [A3]
        v %+ %1  xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  ymm0, [A2]
        vmovdqu  ymm1, [A3]
        v %+ %1  ymm0, ymm1
        vmovdqu  [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro
4819
4820IEMIMPL_FP_F2 addps, 3 ; 3 = the AVX form is emitted with three operands (dst, src1, src2)
4821IEMIMPL_FP_F2 addpd, 3
4822IEMIMPL_FP_F2 mulps, 3
4823IEMIMPL_FP_F2 mulpd, 3
4824IEMIMPL_FP_F2 subps, 3
4825IEMIMPL_FP_F2 subpd, 3
4826IEMIMPL_FP_F2 minps, 3
4827IEMIMPL_FP_F2 minpd, 3
4828IEMIMPL_FP_F2 divps, 3
4829IEMIMPL_FP_F2 divpd, 3
4830IEMIMPL_FP_F2 maxps, 3
4831IEMIMPL_FP_F2 maxpd, 3
4832IEMIMPL_FP_F2 haddps, 3
4833IEMIMPL_FP_F2 haddpd, 3
4834IEMIMPL_FP_F2 hsubps, 3
4835IEMIMPL_FP_F2 hsubpd, 3
4836IEMIMPL_FP_F2 addsubps, 3
4837IEMIMPL_FP_F2 addsubpd, 3
4838
4839
4840;;
4841; These are actually unary operations but to keep it simple
4842; we treat them as binary for now, so the output result is
4843; always in sync with the register where the result might get written
4844; to.
4845IEMIMPL_FP_F2 sqrtps, 2 ; 2 = the AVX form is emitted with two operands (dst, src)
4846IEMIMPL_FP_F2 rsqrtps, 2
4847IEMIMPL_FP_F2 sqrtpd, 2
4848IEMIMPL_FP_F2 cvtdq2ps, 2
4849IEMIMPL_FP_F2 cvtps2dq, 2
4850IEMIMPL_FP_F2 cvttps2dq, 2
; 0 = only the SSE procedure is generated for the following instructions.
4851IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4852IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
4853IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4854
4855
4856;;
4857; Floating point instruction working on a full sized register and a single precision operand.
4858;
4859; @param 1 The instruction
4860;
4861; @param A0 FPU context (FXSTATE or XSAVEAREA).
4862; @param A1 Where to return the result including the MXCSR value.
4863; @param A2 Pointer to the first media register size operand (input/output).
4864; @param A3 Pointer to the second single precision floating point value (input).
4865;
%macro IEMIMPL_FP_F2_R32 1
;
; SSE variant: the first operand is a full 128-bit register image at A2, the second
; a single 32-bit value at A3.  The result goes to the IEMSSERESULT at A1.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu   xmm0, [A2]
        movd     xmm1, [A3]             ; loads only the 32-bit scalar
        %1       xmm0, xmm1
        movdqu   [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

;
; AVX variant: same operands, three-operand instruction form, result goes to the
; IEMAVX128RESULT at A1.
;
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm0, [A2]
        vmovd    xmm1, [A3]             ; loads only the 32-bit scalar
        v %+ %1  xmm0, xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; matches IEMIMPL_FP_F2_R64
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro
4897
4898IEMIMPL_FP_F2_R32 addss ; each line generates iemAImpl_<insn>_u128_r32 and iemAImpl_v<insn>_u128_r32
4899IEMIMPL_FP_F2_R32 mulss
4900IEMIMPL_FP_F2_R32 subss
4901IEMIMPL_FP_F2_R32 minss
4902IEMIMPL_FP_F2_R32 divss
4903IEMIMPL_FP_F2_R32 maxss
4904IEMIMPL_FP_F2_R32 cvtss2sd
4905IEMIMPL_FP_F2_R32 sqrtss
4906IEMIMPL_FP_F2_R32 rsqrtss
4907
4908
4909;;
4910; Floating point instruction working on a full sized register and a double precision operand.
4911;
4912; @param 1 The instruction
4913;
4914; @param A0 FPU context (FXSTATE or XSAVEAREA).
4915; @param A1 Where to return the result including the MXCSR value.
4916; @param A2 Pointer to the first media register size operand (input/output).
4917; @param A3 Pointer to the second double precision floating point value (input).
4918;
4919%macro IEMIMPL_FP_F2_R64 1
; SSE variant: 128-bit first operand at A2, 64-bit scalar at A3, result to the IEMSSERESULT at A1.
4920BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
4921 PROLOGUE_4_ARGS
4922 IEMIMPL_SSE_PROLOGUE
4923 SSE_LD_FXSTATE_MXCSR A0
4924
4925 movdqu xmm0, [A2]
4926 movq xmm1, [A3] ; loads only the 64-bit scalar
4927 %1 xmm0, xmm1
4928 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4929
4930 SSE_ST_FXSTATE_MXCSR A1, A0
4931 IEMIMPL_SSE_EPILOGUE
4932 EPILOGUE_4_ARGS
4933ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
4934
; AVX variant: same operands, three-operand instruction form, result to the IEMAVX128RESULT at A1.
4935BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
4936 PROLOGUE_4_ARGS
4937 IEMIMPL_AVX_PROLOGUE
4938 AVX_LD_XSAVEAREA_MXCSR A0
4939
4940 vmovdqu xmm0, [A2]
4941 vmovq xmm1, [A3] ; loads only the 64-bit scalar
4942 v %+ %1 xmm0, xmm0, xmm1
4943 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4944
4945 AVX128_ST_XSAVEAREA_MXCSR A1
4946 IEMIMPL_AVX_EPILOGUE
4947 EPILOGUE_4_ARGS
4948ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
4949%endmacro
4950
4951IEMIMPL_FP_F2_R64 addsd ; each line generates iemAImpl_<insn>_u128_r64 and iemAImpl_v<insn>_u128_r64
4952IEMIMPL_FP_F2_R64 mulsd
4953IEMIMPL_FP_F2_R64 subsd
4954IEMIMPL_FP_F2_R64 minsd
4955IEMIMPL_FP_F2_R64 divsd
4956IEMIMPL_FP_F2_R64 maxsd
4957IEMIMPL_FP_F2_R64 cvtsd2ss
4958IEMIMPL_FP_F2_R64 sqrtsd
4959
4960
4961;;
4962; Macro for the cvtpd2ps/cvtps2pd instructions.
4963;
4964; @param 1 The instruction name.
4965; @param 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
4966;
4967; @param A0 FPU context (FXSTATE or XSAVEAREA).
4968; @param A1 Where to return the result including the MXCSR value.
4969; @param A2 Pointer to the first media register size operand (input/output).
4970; @param A3 Pointer to the second media register size operand (input).
4971;
4972%macro IEMIMPL_CVT_F2 2
; SSE variant, result to the IEMSSERESULT at A1.
4973BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4974 PROLOGUE_4_ARGS
4975 IEMIMPL_SSE_PROLOGUE
4976 SSE_LD_FXSTATE_MXCSR A0
4977
4978 movdqu xmm0, [A2]
4979 movdqu xmm1, [A3]
4980 %1 xmm0, xmm1
4981 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4982
4983 SSE_ST_FXSTATE_MXCSR A1, A0
4984 IEMIMPL_SSE_EPILOGUE
4985 EPILOGUE_4_ARGS
4986ENDPROC iemAImpl_ %+ %1 %+ _u128
4987
; AVX 128-bit variant (two-operand form), result to the IEMAVX128RESULT at A1.
4988BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
4989 PROLOGUE_4_ARGS
4990 IEMIMPL_AVX_PROLOGUE
4991 AVX_LD_XSAVEAREA_MXCSR A0
4992
4993 vmovdqu xmm0, [A2]
4994 vmovdqu xmm1, [A3]
4995 v %+ %1 xmm0, xmm1
4996 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4997
4998 AVX128_ST_XSAVEAREA_MXCSR A1
4999 IEMIMPL_AVX_EPILOGUE
5000 EPILOGUE_4_ARGS
5001ENDPROC iemAImpl_v %+ %1 %+ _u128
5002
; AVX 256-bit variant; parameter 2 selects which side of the conversion is 256 bits wide.
; Both operands are loaded as 32 bytes and 32 bytes are stored regardless of parameter 2.
; NOTE(review): this assumes A2/A3 always point at 32 readable bytes - confirm against callers.
5003BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
5004 PROLOGUE_4_ARGS
5005 IEMIMPL_AVX_PROLOGUE
5006 AVX_LD_XSAVEAREA_MXCSR A0
5007
5008 vmovdqu ymm0, [A2]
5009 vmovdqu ymm1, [A3]
5010 %if %2 == 0
5011 v %+ %1 xmm0, ymm1 ; 256-bit source, 128-bit result
5012 %else
5013 v %+ %1 ymm0, xmm1 ; 128-bit source, 256-bit result
5014 %endif
5015 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
5016
5017 AVX256_ST_XSAVEAREA_MXCSR A1
5018 IEMIMPL_AVX_EPILOGUE
5019 EPILOGUE_4_ARGS
5020ENDPROC iemAImpl_v %+ %1 %+ _u256
5021%endmacro
5022
5023IEMIMPL_CVT_F2 cvtpd2ps, 0 ; 0: the _u256 variant is emitted as vcvtpd2ps xmm0, ymm1
5024IEMIMPL_CVT_F2 cvtps2pd, 1 ; 1: the _u256 variant is emitted as vcvtps2pd ymm0, xmm1
5025
5026
5027;;
5028; shufps instructions with 8-bit immediates.
5029;
5030; @param A0 Pointer to the destination media register size operand (input/output).
5031; @param A1 Pointer to the first source media register size operand (input).
5032; @param A2 The 8-bit immediate
5033;
5034BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
; The immediate cannot be supplied at run time, so the procedure calls into a table
; of 256 equally sized stubs (.imm0 onwards), one per immediate value.
5035 PROLOGUE_3_ARGS
5036 IEMIMPL_SSE_PROLOGUE
5037
5038 movdqu xmm0, [A0]
5039 movdqu xmm1, [A1]
5040 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5041 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5042 lea T0, [A2 + A2*4] ; sizeof(endbrxx+shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
5043 %else
5044 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
5045 %endif
5046 lea T1, [T1 + T0*2] ; T1 = address of the stub for immediate A2
5047 IBT_NOTRACK
5048 call T1
5049 movdqu [A0], xmm0
5050
5051 IEMIMPL_SSE_EPILOGUE
5052 EPILOGUE_3_ARGS
; The stub table; the int3 pads each stub to the size assumed by the index calculation above.
5053 %assign bImm 0
5054 %rep 256
5055.imm %+ bImm:
5056 IBT_ENDBRxx_WITHOUT_NOTRACK
5057 shufps xmm0, xmm1, bImm
5058 ret
5059 int3
5060 %assign bImm bImm + 1
5061 %endrep
5062.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5063ENDPROC iemAImpl_shufps_u128
5064
5065
5066;;
5067; shufpd instruction with 8-bit immediates.
5068;
5069; @param A0 Pointer to the destination media register size operand (input/output).
5070; @param A1 Pointer to the first source media register size operand (input).
5071; @param A2 The 8-bit immediate
5072;
5073BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
; Same stub-table dispatch as iemAImpl_shufps_u128; no int3 padding is used in the stubs here.
5074 PROLOGUE_3_ARGS
5075 IEMIMPL_SSE_PROLOGUE
5076
5077 movdqu xmm0, [A0]
5078 movdqu xmm1, [A1]
5079 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5080 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5081 lea T0, [A2 + A2*4] ; sizeof(endbrxx+shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
5082 %else
5083 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
5084 %endif
5085 lea T1, [T1 + T0*2] ; T1 = address of the stub for immediate A2
5086 IBT_NOTRACK
5087 call T1
5088 movdqu [A0], xmm0
5089
5090 IEMIMPL_SSE_EPILOGUE
5091 EPILOGUE_3_ARGS
5092 %assign bImm 0
5093 %rep 256
5094.imm %+ bImm:
5095 IBT_ENDBRxx_WITHOUT_NOTRACK
5096 shufpd xmm0, xmm1, bImm
5097 ret
5098 %assign bImm bImm + 1
5099 %endrep
5100.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5101ENDPROC iemAImpl_shufpd_u128
5102
5103
5104;;
5105; vshufp{s,d} instructions with 8-bit immediates.
5106;
5107; @param 1 The instruction name.
5108;
5109; @param A0 Pointer to the destination media register size operand (output).
5110; @param A1 Pointer to the first source media register size operand (input).
5111; @param A2 Pointer to the second source media register size operand (input).
5112; @param A3 The 8-bit immediate
5113;
5114%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
; 128-bit variant; dispatches on the immediate in A3 through a table of 256 equally sized stubs.
; NOTE(review): operands are moved with the legacy movdqu here while the _u256 variant uses vmovdqu.
5115BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5116 PROLOGUE_4_ARGS
5117 IEMIMPL_AVX_PROLOGUE
5118
5119 movdqu xmm0, [A1]
5120 movdqu xmm1, [A2]
5121 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5122 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5123 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5124 %else
5125 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5126 %endif
5127 lea T1, [T1 + T0*2] ; T1 = address of the stub for immediate A3
5128 IBT_NOTRACK
5129 call T1
5130 movdqu [A0], xmm0
5131
5132 IEMIMPL_AVX_EPILOGUE
5133 EPILOGUE_4_ARGS
5134 %assign bImm 0
5135 %rep 256
5136.imm %+ bImm:
5137 IBT_ENDBRxx_WITHOUT_NOTRACK
5138 %1 xmm0, xmm0, xmm1, bImm
5139 ret
5140 %assign bImm bImm + 1
5141 %endrep
5142.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5143ENDPROC iemAImpl_ %+ %1 %+ _u128
5144
; 256-bit variant, same dispatch scheme on ymm registers.
5145BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5146 PROLOGUE_4_ARGS
5147 IEMIMPL_AVX_PROLOGUE
5148
5149 vmovdqu ymm0, [A1]
5150 vmovdqu ymm1, [A2]
5151 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5152 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5153 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5154 %else
5155 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5156 %endif
5157 lea T1, [T1 + T0*2] ; T1 = address of the stub for immediate A3
5158 IBT_NOTRACK
5159 call T1
5160 vmovdqu [A0], ymm0
5161
5162 IEMIMPL_AVX_EPILOGUE
5163 EPILOGUE_4_ARGS
5164 %assign bImm 0
5165 %rep 256
5166.imm %+ bImm:
5167 IBT_ENDBRxx_WITHOUT_NOTRACK
5168 %1 ymm0, ymm0, ymm1, bImm
5169 ret
5170 %assign bImm bImm + 1
5171 %endrep
5172.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5173ENDPROC iemAImpl_ %+ %1 %+ _u256
5174%endmacro
5175
5176IEMIMPL_MEDIA_AVX_VSHUFPX vshufps ; generates iemAImpl_vshufps_u128 and iemAImpl_vshufps_u256
5177IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd ; generates iemAImpl_vshufpd_u128 and iemAImpl_vshufpd_u256
5178
5179
5180;;
5181; One of the [p]blendv{b,ps,pd} variants
5182;
5183; @param 1 The instruction
5184;
5185; @param A0 Pointer to the first media register sized operand (input/output).
5186; @param A1 Pointer to the second media sized value (input).
5187; @param A2 Pointer to the media register sized mask value (input).
5188;
%macro IEMIMPL_P_BLEND 1
;
; The instruction is emitted with two explicit operands; the mask is loaded into xmm0
; because the SSE4.1 variable blends use xmm0 as an implicit third operand.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
5204
5205IEMIMPL_P_BLEND pblendvb ; each line generates iemAImpl_<insn>_u128
5206IEMIMPL_P_BLEND blendvps
5207IEMIMPL_P_BLEND blendvpd
5208
5209
5210;;
5211; One of the v[p]blendv{b,ps,pd} variants
5212;
5213; @param 1 The instruction
5214;
5215; @param A0 Pointer to the first media register sized operand (output).
5216; @param A1 Pointer to the first media register sized operand (input).
5217; @param A2 Pointer to the second media register sized operand (input).
5218; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
;
; 128-bit variant: [A0] = %1([A1], [A2], mask [A3]); all operands are explicit.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]              ; the mask
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

;
; 256-bit variant, same operation on ymm registers.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]              ; the mask
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
5248
5249IEMIMPL_AVX_P_BLEND vpblendvb ; each line generates iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256
5250IEMIMPL_AVX_P_BLEND vblendvps
5251IEMIMPL_AVX_P_BLEND vblendvpd
5252
5253
5254;;
5255; palignr mm1, mm2/m64 instruction.
5256;
5257; @param A0 Pointer to the first media register sized operand (input/output).
5258; @param A1 The second register sized operand (input).
5259; @param A2 The 8-bit immediate.
5260BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
; MMX form: the first operand is read from and written back to [A0], the second is
; passed by value in A1.  Dispatch on the immediate goes through 256 equally sized stubs.
5261 PROLOGUE_3_ARGS
5262 IEMIMPL_MMX_PROLOGUE
5263
5264 movq mm0, [A0]
5265 movq mm1, A1 ; A1 holds the value itself, not a pointer
5266 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5267 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5268 lea T0, [A2 + A2*4] ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
5269 %else
5270 lea T0, [A2 + A2*2] ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
5271 %endif
5272 lea T1, [T1 + T0*2] ; T1 = address of the stub for immediate A2
5273 IBT_NOTRACK
5274 call T1
5275 movq [A0], mm0
5276
5277 IEMIMPL_MMX_EPILOGUE
5278 EPILOGUE_3_ARGS
5279 %assign bImm 0
5280 %rep 256
5281.imm %+ bImm:
5282 IBT_ENDBRxx_WITHOUT_NOTRACK
5283 palignr mm0, mm1, bImm
5284 ret
5285 %assign bImm bImm + 1
5286 %endrep
5287.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5288ENDPROC iemAImpl_palignr_u64
5289
5290
5291;;
5292; SSE instructions with 8-bit immediates of the form
5293; xxx xmm1, xmm2, imm8.
5294; where the instruction encoding takes up 6 bytes.
5295;
5296; @param 1 The instruction name.
5297;
5298; @param A0 Pointer to the first media register size operand (input/output).
5299; @param A1 Pointer to the second source media register size operand (input).
5300; @param A2 The 8-bit immediate
5301;
5302%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
; Dispatches on the immediate in A2 through a table of 256 equally sized stubs.
5303BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5304 PROLOGUE_3_ARGS
5305 IEMIMPL_SSE_PROLOGUE
5306
5307 movdqu xmm0, [A0]
5308 movdqu xmm1, [A1]
5309 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5310 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5311 lea T0, [A2 + A2*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
5312 lea T1, [T1 + T0*4]
5313 %else
5314 lea T1, [T1 + A2*8] ; sizeof(insnX+ret+int3) == 8: A2 * 8
5315 %endif
5316 IBT_NOTRACK
5317 call T1 ; T1 = address of the stub for immediate A2
5318 movdqu [A0], xmm0
5319
5320 IEMIMPL_SSE_EPILOGUE
5321 EPILOGUE_3_ARGS
; The stub table; the int3 pads each stub to the size assumed by the index calculation above.
5322 %assign bImm 0
5323 %rep 256
5324.imm %+ bImm:
5325 IBT_ENDBRxx_WITHOUT_NOTRACK
5326 %1 xmm0, xmm1, bImm
5327 ret
5328 int3
5329 %assign bImm bImm + 1
5330 %endrep
5331.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5332ENDPROC iemAImpl_ %+ %1 %+ _u128
5333%endmacro
5334
5335IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps ; each line generates iemAImpl_<insn>_u128
5336IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
5337IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
5338IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
5339IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
5340IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5341IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5342
5343
5344;;
5345; AVX instructions with 8-bit immediates of the form
5346; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5347; where the instruction encoding takes up 6 bytes.
5348;
5349; @param 1 The instruction name.
5350; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5351; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5352;
5353; @param A0 Pointer to the destination media register size operand (output).
5354; @param A1 Pointer to the first source media register size operand (input).
5355; @param A2 Pointer to the second source media register size operand (input).
5356; @param A3 The 8-bit immediate
5357;
5358%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
; 128-bit variant, only generated when parameter 2 is 1.
; NOTE(review): operands are moved with the legacy movdqu here while the _u256 variant uses vmovdqu.
5359 %if %2 == 1
5360BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5361 PROLOGUE_4_ARGS
5362 IEMIMPL_AVX_PROLOGUE
5363
5364 movdqu xmm0, [A1]
5365 movdqu xmm1, [A2]
5366 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5367 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5368 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5369 lea T1, [T1 + T0*4]
5370 %else
5371 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5372 %endif
5373 IBT_NOTRACK
5374 call T1 ; T1 = address of the stub for immediate A3
5375 movdqu [A0], xmm0
5376
5377 IEMIMPL_AVX_EPILOGUE
5378 EPILOGUE_4_ARGS
5379 %assign bImm 0
5380 %rep 256
5381.imm %+ bImm:
5382 IBT_ENDBRxx_WITHOUT_NOTRACK
5383 %1 xmm0, xmm0, xmm1, bImm
5384 ret
5385 int3
5386 %assign bImm bImm + 1
5387 %endrep
5388.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5389ENDPROC iemAImpl_ %+ %1 %+ _u128
5390 %endif
5391
; 256-bit variant, only generated when parameter 3 is 1.
5392 %if %3 == 1
5393BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5394 PROLOGUE_4_ARGS
5395 IEMIMPL_AVX_PROLOGUE
5396
5397 vmovdqu ymm0, [A1]
5398 vmovdqu ymm1, [A2]
5399 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5400 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5401 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5402 lea T1, [T1 + T0*4]
5403 %else
5404 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5405 %endif
5406 IBT_NOTRACK
5407 call T1 ; T1 = address of the stub for immediate A3
5408 vmovdqu [A0], ymm0
5409
5410 IEMIMPL_AVX_EPILOGUE
5411 EPILOGUE_4_ARGS
5412 %assign bImm 0
5413 %rep 256
5414.imm %+ bImm:
5415 IBT_ENDBRxx_WITHOUT_NOTRACK
5416 %1 ymm0, ymm0, ymm1, bImm
5417 ret
5418 int3
5419 %assign bImm bImm + 1
5420 %endrep
5421.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5422ENDPROC iemAImpl_ %+ %1 %+ _u256
5423 %endif
5424%endmacro
5425
5426IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1 ; second/third argument: generate the _u128 / _u256 procedure
5427IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
5428IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
5429IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
5430IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0 ; _u128 only
5431IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1 ; _u256 only
5432IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1 ; _u256 only
5433
5434
5435;;
5436; Need to move this as well somewhere better?
5437;
5438struc IEMPCMPISTRXSRC
; Source operand pair read by the pcmpistri/pcmpistrm helpers.
5439 .uSrc1 resd 4 ; 4 dwords = 16 bytes, first source operand
5440 .uSrc2 resd 4 ; 4 dwords = 16 bytes, second source operand
5441endstruc
5442
5443struc IEMPCMPESTRXSRC
; Source operands read by the pcmpestri/pcmpestrm helpers, including the values loaded into xAX and xDX.
5444 .uSrc1 resd 4 ; 4 dwords = 16 bytes, first source operand
5445 .uSrc2 resd 4 ; 4 dwords = 16 bytes, second source operand
5446 .u64Rax resd 2 ; 8 bytes, loaded into xAX before the instruction executes
5447 .u64Rdx resd 2 ; 8 bytes, loaded into xDX before the instruction executes
5448endstruc
5449
5450;;
5451; The pcmpistri instruction.
5452;
5453; @param A0 Pointer to the ECX register to store the result to (output).
5454; @param A1 Pointer to the EFLAGS register.
5455; @param A2 Pointer to the structure containing the source operands (input).
5456; @param A3 The 8-bit immediate
5457;
5458BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
; Dispatches on the immediate in A3 through a table of 256 equally sized stubs; the
; index produced in ecx is stored to the location A0 pointed at on entry.
5459 PROLOGUE_4_ARGS
5460 IEMIMPL_SSE_PROLOGUE
5461
5462 movdqu xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
5463 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
5464 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5465 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5466 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5467 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5468 lea T1, [T1 + T0*4]
5469 %else
5470 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5471 %endif
5472 IBT_NOTRACK
5473 call T1 ; T1 = address of the stub for immediate A3
5474
5475 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5476 mov [T2], ecx
5477
5478 IEMIMPL_SSE_EPILOGUE
5479 EPILOGUE_4_ARGS
; The stub table; the int3 pads each stub to the size assumed by the index calculation above.
5480 %assign bImm 0
5481 %rep 256
5482.imm %+ bImm:
5483 IBT_ENDBRxx_WITHOUT_NOTRACK
5484 pcmpistri xmm0, xmm1, bImm
5485 ret
5486 int3
5487 %assign bImm bImm + 1
5488 %endrep
5489.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5490ENDPROC iemAImpl_pcmpistri_u128
5491
5492;;
5493; The pcmpestri instruction.
5494;
5495; @param A0 Pointer to the ECX register to store the result to (output).
5496; @param A1 Pointer to the EFLAGS register.
5497; @param A2 Pointer to the structure containing the source operands (input).
5498; @param A3 The 8-bit immediate
5499;
;
; Dispatches on the immediate in A3 through a table of 256 equally sized stubs.  The
; explicit lengths are loaded into xAX/xDX from the source structure right before the
; call, and the index produced in ecx is stored to the location A0 pointed at on entry.
;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                                 ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax]  ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX                     ; pop leaves EFLAGS alone, so the flags from the stub are still intact
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        ; pcmpestri xmm0, xmm1, bImm with REX.W so the full RAX/RDX are used (EAX/EDX only
        ; otherwise).  Hand encoded because REX must directly precede the 0x0f escape byte:
        ; a 0x48 emitted in front of the assembler's mandatory 0x66 prefix is ignored.
        ; The size is unchanged: 7 bytes + ret == 8.
        db      0x66, 0x48, 0x0f, 0x3a, 0x61, 0xc1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestri_u128
5537
5538;;
5539; The pcmpistrm instruction template.
5540;
5541; @param A0 Pointer to the XMM0 register to store the result to (output).
5542; @param A1 Pointer to the EFLAGS register.
5543; @param A2 Pointer to the structure containing the source operands (input).
5544; @param A3 The 8-bit immediate
5545;
;
; Dispatches on the immediate in A3 through a table of 256 equally sized stubs; the
; mask the instruction leaves in xmm0 is stored to [A0].
;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+pcmpistrm+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        ; The stub address must end up in T1, the register that is called below.  Writing it
        ; to T0 left T1 pointing at .imm0, so immediate 0 was executed for every A3.
        lea     T1, [T1 + A3*8]         ; sizeof(pcmpistrm+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpistrm_u128
5578
5579;;
5580; The pcmpestrm instruction template.
5581;
5582; @param A0 Pointer to the XMM0 register to store the result to (output).
5583; @param A1 Pointer to the EFLAGS register.
5584; @param A2 Pointer to the structure containing the source operands (input).
5585; @param A3 The 8-bit immediate
5586;
;
; Dispatches on the immediate in A3 through a table of 256 equally sized stubs.  The
; explicit lengths are loaded into xAX/xDX from the source structure right before the
; call, and the mask the instruction leaves in xmm0 is stored to [A0].
;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                                 ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax]  ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX                     ; pop leaves EFLAGS alone, so the flags from the stub are still intact
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        ; pcmpestrm xmm1, xmm2, bImm with REX.W so the full RAX/RDX are used (EAX/EDX only
        ; otherwise).  Hand encoded because REX must directly precede the 0x0f escape byte:
        ; a 0x48 emitted in front of the assembler's mandatory 0x66 prefix is ignored.
        ; The size is unchanged: 7 bytes + ret == 8.
        db      0x66, 0x48, 0x0f, 0x3a, 0x60, 0xca, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestrm_u128
5623
5624
5625;;
5626; pinsrw instruction.
5627;
5628; @param A0 Pointer to the first media register size operand (input/output).
5629; @param A1 The 16 bit input operand (input).
5630; @param A2 The 8-bit immediate
5631;
5632BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
; MMX form: [A0] is loaded into mm0, the word from A1 is inserted at the position given
; by the immediate (A2), and mm0 is written back.  Dispatch uses 256 equally sized stubs.
; NOTE(review): works on an MMX register but uses IEMIMPL_SSE_PROLOGUE/EPILOGUE, whereas
; iemAImpl_palignr_u64 uses the IEMIMPL_MMX_* pair - confirm which is intended.
5633 PROLOGUE_3_ARGS
5634 IEMIMPL_SSE_PROLOGUE
5635
5636 movq mm0, [A0]
5637 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5638 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5639 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
5640 %else
5641 lea T0, [A2 + A2*4] ; sizeof(pinsrw+ret) == 5: A2 * 5
5642 %endif
5643 lea T1, [T1 + T0] ; T1 = address of the stub for immediate A2
5644 IBT_NOTRACK
5645 call T1
5646 movq [A0], mm0
5647
5648 IEMIMPL_SSE_EPILOGUE
5649 EPILOGUE_3_ARGS
5650 %assign bImm 0
5651 %rep 256
5652.imm %+ bImm:
5653 IBT_ENDBRxx_WITHOUT_NOTRACK
5654 pinsrw mm0, A1_32, bImm
5655 ret
5656 %assign bImm bImm + 1
5657 %endrep
5658.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5659ENDPROC iemAImpl_pinsrw_u64
5660
5661BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
; SSE form of pinsrw: same scheme as iemAImpl_pinsrw_u64 but on xmm0 with 6-byte stubs.
5662 PROLOGUE_3_ARGS
5663 IEMIMPL_SSE_PROLOGUE
5664
5665 movdqu xmm0, [A0]
5666 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5667 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5668 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5669 %else
5670 lea T0, [A2 + A2*2] ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5671 %endif
5672 lea T1, [T1 + T0*2] ; T1 = address of the stub for immediate A2
5673 IBT_NOTRACK
5674 call T1
5675 movdqu [A0], xmm0
5676
5677 IEMIMPL_SSE_EPILOGUE
5678 EPILOGUE_3_ARGS
5679 %assign bImm 0
5680 %rep 256
5681.imm %+ bImm:
5682 IBT_ENDBRxx_WITHOUT_NOTRACK
5683 pinsrw xmm0, A1_32, bImm
5684 ret
5685 %assign bImm bImm + 1
5686 %endrep
5687.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5688ENDPROC iemAImpl_pinsrw_u128
5689
5690;;
5691; vpinsrw instruction.
5692;
5693; @param A0 Pointer to the first media register size operand (output).
5694; @param A1 Pointer to the source media register size operand (input).
5695; @param A2 The 16 bit input operand (input).
5696; @param A3 The 8-bit immediate
5697;
5698BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
; AVX form: the source register image at [A1] is loaded into xmm0, the word from A2 is
; inserted at the position given by the immediate (A3) and the result is stored to [A0].
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE/EPILOGUE and a legacy movdqu around an AVX
; instruction - confirm whether the AVX pair was intended.
5699 PROLOGUE_4_ARGS
5700 IEMIMPL_SSE_PROLOGUE
5701
5702 movdqu xmm0, [A1]
5703 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5704 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5705 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
5706 %else
5707 lea T0, [A3 + A3*2] ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
5708 %endif
5709 lea T1, [T1 + T0*2] ; T1 = address of the stub for immediate A3
5710 mov A1, A2 ; A2 requires longer encoding on Windows
5711 IBT_NOTRACK
5712 call T1
5713 movdqu [A0], xmm0
5714
5715 IEMIMPL_SSE_EPILOGUE
5716 EPILOGUE_4_ARGS
5717 %assign bImm 0
5718 %rep 256
5719.imm %+ bImm:
5720 IBT_ENDBRxx_WITHOUT_NOTRACK
5721 vpinsrw xmm0, xmm0, A1_32, bImm
5722 ret
5723 %assign bImm bImm + 1
5724 %endrep
5725.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5726ENDPROC iemAImpl_vpinsrw_u128
5727
5728
5729;;
5730; pextrw instruction.
5731;
5732; @param A0 Pointer to the 16bit output operand (output).
5733; @param A1 The media register size operand (input): passed by value to the _u64 variant, by pointer to the _u128 variant.
5734; @param A2 The 8-bit immediate
5735;
5736BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
; MMX form: the source value is taken directly from A1, the stub selected by the
; immediate (A2) extracts a word into T0_32 and its low 16 bits are stored to [A0].
; NOTE(review): works on an MMX register but uses IEMIMPL_SSE_PROLOGUE/EPILOGUE - confirm.
5737 PROLOGUE_3_ARGS
5738 IEMIMPL_SSE_PROLOGUE
5739
5740 movq mm0, A1 ; A1 holds the value itself, not a pointer
5741 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5742 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5743 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
5744 %else
5745 lea T0, [A2 + A2*4] ; sizeof(pextrw+ret) == 5: A2 * 5
5746 %endif
5747 lea T1, [T1 + T0] ; T1 = address of the stub for immediate A2
5748 IBT_NOTRACK
5749 call T1
5750 mov word [A0], T0_16 ; T0 was overwritten by the stub with the extracted word
5751
5752 IEMIMPL_SSE_EPILOGUE
5753 EPILOGUE_3_ARGS
5754 %assign bImm 0
5755 %rep 256
5756.imm %+ bImm:
5757 IBT_ENDBRxx_WITHOUT_NOTRACK
5758 pextrw T0_32, mm0, bImm
5759 ret
5760 %assign bImm bImm + 1
5761 %endrep
5762.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5763ENDPROC iemAImpl_pextrw_u64
5764
5765BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
; SSE form: the source is loaded from [A1], the stub selected by the immediate (A2)
; extracts a word into T0_32 and its low 16 bits are stored to [A0].
5766 PROLOGUE_3_ARGS
5767 IEMIMPL_SSE_PROLOGUE
5768
5769 movdqu xmm0, [A1]
5770 lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
5771 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5772 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5773 %else
5774 lea T0, [A2 + A2*2] ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5775 %endif
5776 lea T1, [T1 + T0*2] ; T1 = address of the stub for immediate A2
5777 IBT_NOTRACK
5778 call T1
5779 mov word [A0], T0_16 ; T0 was overwritten by the stub with the extracted word
5780
5781 IEMIMPL_SSE_EPILOGUE
5782 EPILOGUE_3_ARGS
5783 %assign bImm 0
5784 %rep 256
5785.imm %+ bImm:
5786 IBT_ENDBRxx_WITHOUT_NOTRACK
5787 pextrw T0_32, xmm0, bImm
5788 ret
5789 %assign bImm bImm + 1
5790 %endrep
5791.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5792ENDPROC iemAImpl_pextrw_u128
5793
;;
; vpextrw instruction.
;
; Extracts the word selected by the immediate from a 128-bit source and
; stores it as a 16-bit result, dispatching through a 256 entry stub table.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; A2 is an 8-bit argument whose upper register bits are not guaranteed
        ; to be zero; it is used full width as the table index, so mask it.
        and     A2, 0xff

        movdqu  xmm0, [A1]              ; xmm0 = source vector
        lea     T1, [.imm0 xWrtRIP]     ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm<A2>
        IBT_NOTRACK
        call    T1                      ; T0_32 = extracted word
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table: one fixed-size entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpextrw T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_vpextrw_u128
5829
5830
;;
; Template for movmskp{s,d} and the VEX encoded counterparts.
;
; Emits three workers per invocation: <sse>_u128, <avx>_u128 and <avx>_u256.
; Each loads the source vector, runs the mask extraction into T0 and stores
; the low byte of T0 through A0.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
; Legacy SSE form, 128-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     [A0], T0_8              ; byte sized store

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; VEX form, 128-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0
        mov     [A0], T0_8              ; byte sized store

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

; VEX form, 256-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     [A0], T0_8              ; byte sized store

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5880
5881
;;
; Returns the guest MXCSR with the freshly raised exception flags merged in,
; then reloads the MXCSR value found on top of the stack and pops it.
;
; The value at [xSP] on entry is presumably the host MXCSR pushed by the
; matching SSE_LD_FXSTATE_MXCSR invocation (defined outside this section).
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note     Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        ; Read the current MXCSR via a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Keep only the exception flags and OR them into the guest's MXCSR.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload the saved MXCSR from the top of the stack and drop the slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5905
5906
;;
; cvttsd2si instruction - 32-bit variant.
;
; Runs the host instruction on the memory operand at A3, stores the 32-bit
; integer result at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 32-bit result operand (output).
; @param    A3      Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0_32, [A3]
        mov     [A2], T0_32             ; dword store

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
5927
;;
; cvttsd2si instruction - 64-bit variant.
;
; Runs the host instruction on the memory operand at A3, stores the 64-bit
; integer result at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 64-bit result operand (output).
; @param    A3      Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0, [A3]
        mov     [A2], T0                ; qword store

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5948
5949
;;
; cvtsd2si instruction - 32-bit variant.
;
; Runs the host instruction on the memory operand at A3, stores the 32-bit
; integer result at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 32-bit result operand (output).
; @param    A3      Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0_32, [A3]
        mov     [A2], T0_32             ; dword store

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
5970
;;
; cvtsd2si instruction - 64-bit variant.
;
; Runs the host instruction on the memory operand at A3, stores the 64-bit
; integer result at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 64-bit result operand (output).
; @param    A3      Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0, [A3]
        mov     [A2], T0                ; qword store

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
5991
5992
;;
; cvttss2si instruction - 32-bit variant.
;
; Runs the host instruction on the memory operand at A3, stores the 32-bit
; integer result at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 32-bit result operand (output).
; @param    A3      Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0_32, [A3]
        mov     [A2], T0_32             ; dword store

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32
6013
;;
; cvttss2si instruction - 64-bit variant.
;
; Runs the host instruction on the memory operand at A3, stores the 64-bit
; integer result at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 64-bit result operand (output).
; @param    A3      Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0, [A3]
        mov     [A2], T0                ; qword store

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
6034
6035
;;
; cvtss2si instruction - 32-bit variant.
;
; Runs the host instruction on the memory operand at A3, stores the 32-bit
; integer result at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 32-bit result operand (output).
; @param    A3      Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0_32, [A3]
        mov     [A2], T0_32             ; dword store

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32
6056
;;
; cvtss2si instruction - 64-bit variant.
;
; Runs the host instruction on the memory operand at A3, stores the 64-bit
; integer result at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 64-bit result operand (output).
; @param    A3      Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0, [A3]
        mov     [A2], T0                ; qword store

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
6077
6078
;;
; cvtsi2ss instruction - 32-bit variant.
;
; Converts the 32-bit integer at A3 using the host instruction, stores the
; low 32 bits of xmm0 at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 32-bit result operand (output).
; @param    A3      Pointer to the 32-bit integer source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, dword [A3]
        movd    dword [A2], xmm0        ; only the low dword holds the result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
6099
;;
; cvtsi2ss instruction - 64-bit variant.
;
; Converts the 64-bit integer at A3 using the host instruction, stores the
; low 32 bits of xmm0 at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 32-bit result operand (output).
; @param    A3      Pointer to the 64-bit integer source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, qword [A3]
        movd    dword [A2], xmm0        ; only the low dword holds the result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
6120
6121
;;
; cvtsi2sd instruction - 32-bit variant.
;
; Converts the 32-bit integer at A3 using the host instruction, stores the
; low 64 bits of xmm0 at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 64-bit result operand (output).
; @param    A3      Pointer to the 32-bit integer source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, dword [A3]
        movq    [A2], xmm0              ; only the low qword holds the result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
6142
;;
; cvtsi2sd instruction - 64-bit variant.
;
; Converts the 64-bit integer at A3 using the host instruction, stores the
; low 64 bits of xmm0 at A2 and returns the merged MXCSR through A1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the 64-bit result operand (output).
; @param    A3      Pointer to the 64-bit integer source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, qword [A3]
        movq    [A2], xmm0              ; only the low qword holds the result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
6163
6164
;;
; Saves the current MXCSR on the stack and loads a working MXCSR derived from
; the guest value: only the FZ, rounding control and DAZ bits are taken over
; and all exceptions are masked.
;
; On exit 4 bytes remain allocated on the stack holding the saved MXCSR; the
; SSE_ST_FXSTATE_MXCSR_ONLY* macros reload and release them.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        ; Slot that stays allocated: the MXCSR in effect on entry.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Build the working value from the guest MXCSR.
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; ldmxcsr takes a memory operand, so go through a scratch slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
6184
6185
;;
; Merges the exception flags raised since the MXCSR was loaded into the
; MXCSR value at the given address (read-modify-write), then reloads the
; MXCSR value saved on top of the stack and pops it.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note     Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        ; Read the current MXCSR via a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Keep only the exception flags and OR them into the value at %1.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%1]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload the saved MXCSR from the top of the stack and drop the slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
6208
6209
;;
; UCOMISS (SSE)
;
; Compares the two sources with the host instruction, saves the resulting
; status flags to the EFLAGS value and merges the exception flags into the
; MXCSR value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        ucomiss xmm0, xmm1
        ; Capture the flags before the sub/add in the MXCSR store macro alter them.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
6232
;;
; VUCOMISS - VEX encoded form of UCOMISS, same arguments:
; A0 = MXCSR pointer (in/out), A1 = EFLAGS pointer (in/out),
; A2 = first source pointer, A3 = second source pointer.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vucomiss xmm0, xmm1
        ; Capture the flags before the sub/add in the MXCSR store macro alter them.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
6247
6248
;;
; UCOMISD (SSE)
;
; Compares the two sources with the host instruction, saves the resulting
; status flags to the EFLAGS value and merges the exception flags into the
; MXCSR value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        ucomisd xmm0, xmm1
        ; Capture the flags before the sub/add in the MXCSR store macro alter them.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
6271
;;
; VUCOMISD - VEX encoded form of UCOMISD, same arguments:
; A0 = MXCSR pointer (in/out), A1 = EFLAGS pointer (in/out),
; A2 = first source pointer, A3 = second source pointer.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vucomisd xmm0, xmm1
        ; Capture the flags before the sub/add in the MXCSR store macro alter them.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6286
;;
; COMISS (SSE)
;
; Compares the two sources with the host instruction, saves the resulting
; status flags to the EFLAGS value and merges the exception flags into the
; MXCSR value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        comiss  xmm0, xmm1
        ; Capture the flags before the sub/add in the MXCSR store macro alter them.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
6309
;;
; VCOMISS - VEX encoded form of COMISS, same arguments:
; A0 = MXCSR pointer (in/out), A1 = EFLAGS pointer (in/out),
; A2 = first source pointer, A3 = second source pointer.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vcomiss xmm0, xmm1
        ; Capture the flags before the sub/add in the MXCSR store macro alter them.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6324
6325
;;
; COMISD (SSE)
;
; Compares the two sources with the host instruction, saves the resulting
; status flags to the EFLAGS value and merges the exception flags into the
; MXCSR value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        comisd  xmm0, xmm1
        ; Capture the flags before the sub/add in the MXCSR store macro alter them.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
6348
;;
; VCOMISD - VEX encoded form of COMISD, same arguments:
; A0 = MXCSR pointer (in/out), A1 = EFLAGS pointer (in/out),
; A2 = first source pointer, A3 = second source pointer.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vcomisd xmm0, xmm1
        ; Capture the flags before the sub/add in the MXCSR store macro alter them.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6363
6364
;;
; Pair of 128-bit source operands handed to the two-source media workers
; below through a single pointer argument.
;
; @todo Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1      resd 4                  ; first source, 4 dwords = 16 bytes
    .uSrc2      resd 4                  ; second source, 4 dwords = 16 bytes
endstruc
6372
6373
;;
; CMPPS (SSE)
;
; Runs cmpps with the requested immediate on the two sources, stores the
; result vector and merges the raised exception flags into the MXCSR value.
; The immediate selects one of 256 generated stubs.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        ; A3 is an 8-bit argument whose upper register bits are not guaranteed
        ; to be zero; it is used full width as the table index, so mask it.
        and     A3, 0xff

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*8]         ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
 %else
        lea     T0, [A3 + A3*4]         ; sizeof(cmpps+ret) == 5: A3 * 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = &.imm<A3>
        IBT_NOTRACK
        call    T1                      ; cmpps xmm0, xmm1, <A3>
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub table: one fixed-size entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        cmpps   xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_cmpps_u128
6413
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; The immediate selects one of 256 generated stubs (instruction + ret).
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        ; A3 is an 8-bit argument whose upper register bits are not guaranteed
        ; to be zero; it is used full width as the table index, so mask it.
        and     A3, 0xff

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm<A3>
        IBT_NOTRACK
        call    T1                      ; %1 xmm0, xmm1, <A3>
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub table: one fixed-size entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6464
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; The immediate selects one of 256 generated stubs; each stub is padded with
; an int3 so the stride is 8 bytes (12 with the endbr prefix).
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        ; A3 is an 8-bit argument whose upper register bits are not guaranteed
        ; to be zero; it is used full width as the table index, so mask it.
        and     A3, 0xff

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insn+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; %1 xmm0, xmm1, <A3>
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub table: one fixed-size entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; padding, keeps the stride a power of two friendly size
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6519
6520
;;
; Template for SSE instructions of the form
;    xxx mm, xmm.
; that need the MXCSR loaded beforehand and saved afterwards.
;
; The generated worker loads the 128-bit source into xmm0, executes the
; instruction into mm0 and stores the 64-bit result.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]              ; source vector
        %1      mm0, xmm0
        movq    [A1], mm0               ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6550
;;
; Template for SSE instructions of the form
;    xxx xmm, xmm/m64.
; that need the MXCSR loaded beforehand and saved afterwards.
;
; The generated worker loads the current destination into xmm0 and the 64-bit
; source value into mm0, executes the instruction and writes xmm0 back.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    mm0, A2                 ; source value, passed in the register itself
        movdqu  xmm0, [A1]              ; current destination content
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6581
;;
; Template for SSE instructions of the form
;    xxx mm, xmm/m64.
; that need the MXCSR loaded beforehand and saved afterwards.
;
; The generated worker moves the 64-bit source value into xmm0, executes the
; instruction into mm0 and stores the 64-bit result.  Note that the generated
; symbol carries the _u128 suffix.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    xmm0, A2                ; source value, passed in the register itself
        %1      mm0, xmm0
        movq    [A1], mm0               ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6611
;;
; All forms of RDRAND and RDSEED
;
; The generated worker executes the instruction on the given register, stores
; that register at A0 and saves the resulting status flags to the EFLAGS value.
;
; @param    1       The instruction name (rdrand or rdseed).
; @param    2       The register receiving the value (ax, eax or rax).
; @param    3       The operand width in bits, used for the function name suffix.
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2
        mov     [A0], %2                ; mov leaves the flags set by %1 untouched
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6636
6637
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; Runs sha1rnds4 with the requested immediate on the two operands and writes
; the result back to the first one.  The immediate selects one of 256
; generated stubs.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; A2 is an 8-bit argument whose upper register bits are not guaranteed
        ; to be zero; it is used full width as the table index, so mask it.
        and     A2, 0xff

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm<A2>
        IBT_NOTRACK
        call    T1                      ; sha1rnds4 xmm0, xmm1, <A2>
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table: one fixed-size entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4 xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_sha1rnds4_u128
6676
6677
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction takes XMM0 as an implicit third operand, so the constants
; are loaded there and the two explicit operands go into xmm1 and xmm2.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A0]              ; destination / first source
        movdqu  xmm2, [A1]              ; second source
        movdqu  xmm0, [A2]              ; implicit operand
        sha256rnds2 xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
6700
6701
;;
; 32-bit forms of ADCX and ADOX
;
; The generated worker loads the selected flag from the EFLAGS value, adds
; the destination memory operand into A2, writes A2 back to the destination
; and saves the selected flag again.
;
; @param    1       The instruction name.
; @param    2       The EFLAGS bit the instruction consumes and produces.
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A1, %2, 0
        %1      A2_32, [A0]             ; A2_32 += [A0] + flag
        mov     [A0], A2_32             ; mov does not disturb the flag just produced
        IEM_SAVE_FLAGS A1, %2, 0

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro
6721
;;
; 64-bit forms of ADCX and ADOX
;
; The generated worker loads the selected flag from the EFLAGS value, adds
; the destination memory operand into A2, writes A2 back to the destination
; and saves the selected flag again.
;
; @param    1       The instruction name.
; @param    2       The EFLAGS bit the instruction consumes and produces.
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A1, %2, 0
        %1      A2, [A0]                ; A2 += [A0] + flag
        mov     [A0], A2                ; mov does not disturb the flag just produced
        IEM_SAVE_FLAGS A1, %2, 0

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

; ADCX works on the carry flag, ADOX on the overflow flag.
IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette