1; $Id: IEMAllAImpl.asm 100851 2023-08-10 14:34:07Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; RET XX / RET wrapper for fastcall.
43;
44%macro RET_FASTCALL 1
45%ifdef RT_ARCH_X86
46 %ifdef RT_OS_WINDOWS
47 ret %1
48 %else
49 ret
50 %endif
51%else
52 ret
53%endif
54%endmacro
55
56;;
57; NAME for fastcall functions.
58;
59;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
60; escaping (or whatever the dollar is good for here). Thus the ugly
61; prefix argument.
62;
63%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
64%ifdef RT_ARCH_X86
65 %ifdef RT_OS_WINDOWS
66 %undef NAME_FASTCALL
67 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
68 %endif
69%endif
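;
; Illustrative example (added comment, not part of the original source): with
; the 32-bit Windows definition above, BEGINPROC_FASTCALL iemAImpl_add_u8, 12
; would emit the fastcall-decorated symbol @iemAImpl_add_u8@12, while on all
; other targets NAME_FASTCALL simply resolves to NAME(iemAImpl_add_u8).
;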
70
71;;
72; BEGINPROC for fastcall functions.
73;
74; @param 1 The function name (C).
75; @param 2 The argument size on x86.
76;
77%macro BEGINPROC_FASTCALL 2
78GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
79 IBT_ENDBRxx
80%endmacro
81
82
83;
84; We employ some macro assembly here to hide the calling convention differences.
85;
86%ifdef RT_ARCH_AMD64
87 %macro PROLOGUE_1_ARGS 0
88 %endmacro
89 %macro EPILOGUE_1_ARGS 0
90 ret
91 %endmacro
92 %macro EPILOGUE_1_ARGS_EX 0
93 ret
94 %endmacro
95
96 %macro PROLOGUE_2_ARGS 0
97 %endmacro
98 %macro EPILOGUE_2_ARGS 0
99 ret
100 %endmacro
101 %macro EPILOGUE_2_ARGS_EX 1
102 ret
103 %endmacro
104
105 %macro PROLOGUE_3_ARGS 0
106 %endmacro
107 %macro EPILOGUE_3_ARGS 0
108 ret
109 %endmacro
110 %macro EPILOGUE_3_ARGS_EX 1
111 ret
112 %endmacro
113
114 %macro PROLOGUE_4_ARGS 0
115 %endmacro
116 %macro EPILOGUE_4_ARGS 0
117 ret
118 %endmacro
119 %macro EPILOGUE_4_ARGS_EX 1
120 ret
121 %endmacro
122
123 %ifdef ASM_CALL64_GCC
124 %define A0 rdi
125 %define A0_32 edi
126 %define A0_16 di
127 %define A0_8 dil
128
129 %define A1 rsi
130 %define A1_32 esi
131 %define A1_16 si
132 %define A1_8 sil
133
134 %define A2 rdx
135 %define A2_32 edx
136 %define A2_16 dx
137 %define A2_8 dl
138
139 %define A3 rcx
140 %define A3_32 ecx
141 %define A3_16 cx
142 %endif
143
144 %ifdef ASM_CALL64_MSC
145 %define A0 rcx
146 %define A0_32 ecx
147 %define A0_16 cx
148 %define A0_8 cl
149
150 %define A1 rdx
151 %define A1_32 edx
152 %define A1_16 dx
153 %define A1_8 dl
154
155 %define A2 r8
156 %define A2_32 r8d
157 %define A2_16 r8w
158 %define A2_8 r8b
159
160 %define A3 r9
161 %define A3_32 r9d
162 %define A3_16 r9w
163 %endif
164
165 %define T0 rax
166 %define T0_32 eax
167 %define T0_16 ax
168 %define T0_8 al
169
170 %define T1 r11
171 %define T1_32 r11d
172 %define T1_16 r11w
173 %define T1_8 r11b
174
175 %define T2 r10 ; only AMD64
176 %define T2_32 r10d
177 %define T2_16 r10w
178 %define T2_8 r10b
179
180%else
181 ; x86
182 %macro PROLOGUE_1_ARGS 0
183 push edi
184 %endmacro
185 %macro EPILOGUE_1_ARGS 0
186 pop edi
187 ret 0
188 %endmacro
189 %macro EPILOGUE_1_ARGS_EX 1
190 pop edi
191 ret %1
192 %endmacro
193
194 %macro PROLOGUE_2_ARGS 0
195 push edi
196 %endmacro
197 %macro EPILOGUE_2_ARGS 0
198 pop edi
199 ret 0
200 %endmacro
201 %macro EPILOGUE_2_ARGS_EX 1
202 pop edi
203 ret %1
204 %endmacro
205
206 %macro PROLOGUE_3_ARGS 0
207 push ebx
208 mov ebx, [esp + 4 + 4]
209 push edi
210 %endmacro
211 %macro EPILOGUE_3_ARGS_EX 1
212 %if (%1) < 4
 213 %error "With three args, at least 4 bytes must be removed from the stack upon return (32-bit)."
214 %endif
215 pop edi
216 pop ebx
217 ret %1
218 %endmacro
219 %macro EPILOGUE_3_ARGS 0
220 EPILOGUE_3_ARGS_EX 4
221 %endmacro
222
223 %macro PROLOGUE_4_ARGS 0
224 push ebx
225 push edi
226 push esi
227 mov ebx, [esp + 12 + 4 + 0]
228 mov esi, [esp + 12 + 4 + 4]
229 %endmacro
230 %macro EPILOGUE_4_ARGS_EX 1
231 %if (%1) < 8
 232 %error "With four args, at least 8 bytes must be removed from the stack upon return (32-bit)."
233 %endif
234 pop esi
235 pop edi
236 pop ebx
237 ret %1
238 %endmacro
239 %macro EPILOGUE_4_ARGS 0
240 EPILOGUE_4_ARGS_EX 8
241 %endmacro
242
243 %define A0 ecx
244 %define A0_32 ecx
245 %define A0_16 cx
246 %define A0_8 cl
247
248 %define A1 edx
249 %define A1_32 edx
250 %define A1_16 dx
251 %define A1_8 dl
252
253 %define A2 ebx
254 %define A2_32 ebx
255 %define A2_16 bx
256 %define A2_8 bl
257
258 %define A3 esi
259 %define A3_32 esi
260 %define A3_16 si
261
262 %define T0 eax
263 %define T0_32 eax
264 %define T0_16 ax
265 %define T0_8 al
266
267 %define T1 edi
268 %define T1_32 edi
269 %define T1_16 di
270%endif
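;
; Summary of the argument/scratch register mapping above (added for clarity,
; not part of the original source):
;
;         AMD64 GCC/SysV   AMD64 MSC    x86 fastcall
;   A0    rdi              rcx          ecx
;   A1    rsi              rdx          edx
;   A2    rdx              r8           ebx (loaded from the stack in the prologue)
;   A3    rcx              r9           esi (loaded from the stack in the prologue)
;   T0    rax              rax          eax
;   T1    r11              r11          edi
;   T2    r10              r10          (AMD64 only)
;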
271
272
273;;
274; Load the relevant flags from [%1] if there are undefined flags (%3).
275;
276; @remarks Clobbers T0, stack. Changes EFLAGS.
277; @param A2 The register pointing to the flags.
278; @param 1 The parameter (A0..A3) pointing to the eflags.
279; @param 2 The set of modified flags.
280; @param 3 The set of undefined flags.
281;
282%macro IEM_MAYBE_LOAD_FLAGS 3
283 ;%if (%3) != 0
284 pushf ; store current flags
285 mov T0_32, [%1] ; load the guest flags
286 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
287 and T0_32, (%2 | %3) ; select the modified and undefined flags.
288 or [xSP], T0 ; merge guest flags with host flags.
289 popf ; load the mixed flags.
290 ;%endif
291%endmacro
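;
; Illustrative expansion (added comment, not part of the original source):
; 'IEM_MAYBE_LOAD_FLAGS A2, X86_EFL_CF, 0' would emit roughly:
;       pushf                            ; save the current host flags
;       mov     T0_32, [A2]              ; load the guest eflags
;       and     dword [xSP], ~X86_EFL_CF ; clear CF in the saved host flags
;       and     T0_32, X86_EFL_CF        ; keep only CF from the guest eflags
;       or      [xSP], T0                ; merge the guest CF into the host flags
;       popf                             ; make the merged flags live
;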
292
293;;
294; Load the relevant flags from [%1].
295;
296; @remarks Clobbers T0, stack. Changes EFLAGS.
297; @param A2 The register pointing to the flags.
298; @param 1 The parameter (A0..A3) pointing to the eflags.
299; @param 2 The set of flags to load.
300; @param 3 The set of undefined flags.
301;
302%macro IEM_LOAD_FLAGS 3
303 pushf ; store current flags
304 mov T0_32, [%1] ; load the guest flags
305 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
306 and T0_32, (%2 | %3) ; select the modified and undefined flags.
307 or [xSP], T0 ; merge guest flags with host flags.
308 popf ; load the mixed flags.
309%endmacro
310
311;;
312; Update the flags.
313;
314; @remarks Clobbers T0, T1, stack.
315; @param 1 The register pointing to the EFLAGS.
316; @param 2 The mask of modified flags to save.
317; @param 3 The mask of undefined flags to (maybe) save.
318;
319%macro IEM_SAVE_FLAGS 3
320 %if (%2 | %3) != 0
321 pushf
322 pop T1
323 mov T0_32, [%1] ; flags
324 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
325 and T1_32, (%2 | %3) ; select the modified and undefined flags.
326 or T0_32, T1_32 ; combine the flags.
327 mov [%1], T0_32 ; save the flags.
328 %endif
329%endmacro
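;
; Typical usage sketch (added comment, not part of the original source): the
; load and save macros bracket the emulated instruction so that its host
; EFLAGS output is folded back into the guest eflags variable, e.g.:
;       IEM_MAYBE_LOAD_FLAGS A2, X86_EFL_CF, 0   ; put the guest CF into EFLAGS
;       adc     dword [A0], A1_32                ; instruction consuming/producing CF
;       IEM_SAVE_FLAGS       A2, X86_EFL_CF, 0   ; write the resulting CF back
; The real instantiations further down pass the full modified/undefined masks.
;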
330
331;;
332; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
333;
334; @remarks Clobbers T0, T1, stack.
335; @param 1 The register pointing to the EFLAGS.
336; @param 2 The mask of modified flags to save.
337; @param 3 Mask of additional flags to always clear
338; @param 4 Mask of additional flags to always set.
339;
340%macro IEM_SAVE_AND_ADJUST_FLAGS 4
341 %if (%2 | %3 | %4) != 0
342 pushf
343 pop T1
344 mov T0_32, [%1] ; load flags.
345 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
346 and T1_32, (%2) ; select the modified flags.
347 or T0_32, T1_32 ; combine the flags.
348 %if (%4) != 0
349 or T0_32, %4 ; add the always set flags.
350 %endif
351 mov [%1], T0_32 ; save the result.
352 %endif
353%endmacro
354
355;;
356; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
357; signed input (%4[%5]) and parity index (%6).
358;
359; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
360; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
361; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
362;
363; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
364; @param 1 The register pointing to the EFLAGS.
365; @param 2 The mask of modified flags to save.
366; @param 3 Mask of additional flags to always clear
367; @param 4 The result register to set SF by.
368; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
369; @param 6 The (full) register containing the parity table index. Will be modified!
370
371%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
372 %ifdef RT_ARCH_AMD64
373 pushf
374 pop T2
375 %else
376 push T0
377 pushf
378 pop T0
379 %endif
380 mov T1_32, [%1] ; load flags.
381 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
382 %ifdef RT_ARCH_AMD64
383 and T2_32, (%2) ; select the modified flags.
384 or T1_32, T2_32 ; combine the flags.
385 %else
386 and T0_32, (%2) ; select the modified flags.
387 or T1_32, T0_32 ; combine the flags.
388 pop T0
389 %endif
390
 391 ; First calculate SF as it's likely to be referring to the same register as %6 does.
392 bt %4, %5 - 1
393 jnc %%sf_clear
394 or T1_32, X86_EFL_SF
395 %%sf_clear:
396
397 ; Parity last.
398 and %6, 0xff
399 %ifdef RT_ARCH_AMD64
400 lea T2, [NAME(g_afParity) xWrtRIP]
401 or T1_8, [T2 + %6]
402 %else
403 or T1_8, [NAME(g_afParity) + %6]
404 %endif
405
406 mov [%1], T1_32 ; save the result.
407%endmacro
408
409;;
410; Calculates the new EFLAGS using fixed clear and set bit masks.
411;
412; @remarks Clobbers T0.
413; @param 1 The register pointing to the EFLAGS.
414; @param 2 Mask of additional flags to always clear
415; @param 3 Mask of additional flags to always set.
416;
417%macro IEM_ADJUST_FLAGS 3
418 %if (%2 | %3) != 0
419 mov T0_32, [%1] ; Load flags.
420 %if (%2) != 0
421 and T0_32, ~(%2) ; Remove the always cleared flags.
422 %endif
423 %if (%3) != 0
424 or T0_32, %3 ; Add the always set flags.
425 %endif
426 mov [%1], T0_32 ; Save the result.
427 %endif
428%endmacro
429
430;;
431; Calculates the new EFLAGS using fixed clear and set bit masks.
432;
433; @remarks Clobbers T0, %4, EFLAGS.
434; @param 1 The register pointing to the EFLAGS.
435; @param 2 Mask of additional flags to always clear
436; @param 3 Mask of additional flags to always set.
437; @param 4 The (full) register containing the parity table index. Will be modified!
438;
439%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
440 mov T0_32, [%1] ; Load flags.
441 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
442 %if (%3) != 0
443 or T0_32, %3 ; Add the always set flags.
444 %endif
445 and %4, 0xff
446 %ifdef RT_ARCH_AMD64
447 lea T2, [NAME(g_afParity) xWrtRIP]
448 or T0_8, [T2 + %4]
449 %else
450 or T0_8, [NAME(g_afParity) + %4]
451 %endif
452 mov [%1], T0_32 ; Save the result.
453%endmacro
454
455
456;;
457; Checks that the size expression %1 matches %2 adjusted according to
458; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
459; @param 1 The jump array size assembly expression.
460; @param 2 The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
461;
462%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
463 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
464 dw (0xffff - %2 - 256*4) + %1 ; will cause warning if entries are too big.
465 dw (0xffff + %2 + 256*4) - %1 ; will cause warning if entries are too small.
466 %else
467 dw (0xffff - %2) + %1 ; will cause warning if entries are too big.
468 dw (0xffff + %2) - %1 ; will cause warning if entries are too small.
469 %endif
470%endmacro
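;
; How the check works (added comment, not part of the original source): both
; 'dw' operands must fit in an unsigned 16-bit word.  If the actual size %1 is
; larger than expected, the first expression exceeds 0xffff and the assembler
; emits a word-overflow warning; if %1 is smaller, the second expression
; overflows instead.  A hypothetical use site (label names are made up):
;       IEMCHECK_256_JUMP_ARRAY_SIZE (.jmp_table_end - .jmp_table), 0x2000
;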
471
472
473;*********************************************************************************************************************************
474;* External Symbols *
475;*********************************************************************************************************************************
476extern NAME(g_afParity)
477
478
479;;
480; Macro for implementing a binary operator.
481;
482; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
483; variants, except on 32-bit systems where the 64-bit accesses require hand
484; coding.
485;
486; All the functions take a pointer to the destination memory operand in A0,
487; the source register operand in A1 and a pointer to eflags in A2.
488;
489; @param 1 The instruction mnemonic.
490; @param 2 Non-zero if there should be a locked version.
491; @param 3 The modified flags.
492; @param 4 The undefined flags.
493;
494%macro IEMIMPL_BIN_OP 4
495BEGINCODE
496BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
497 PROLOGUE_3_ARGS
498 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
499 %1 byte [A0], A1_8
500 IEM_SAVE_FLAGS A2, %3, %4
501 EPILOGUE_3_ARGS
502ENDPROC iemAImpl_ %+ %1 %+ _u8
503
504BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
505 PROLOGUE_3_ARGS
506 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
507 %1 word [A0], A1_16
508 IEM_SAVE_FLAGS A2, %3, %4
509 EPILOGUE_3_ARGS
510ENDPROC iemAImpl_ %+ %1 %+ _u16
511
512BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
513 PROLOGUE_3_ARGS
514 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
515 %1 dword [A0], A1_32
516 IEM_SAVE_FLAGS A2, %3, %4
517 EPILOGUE_3_ARGS
518ENDPROC iemAImpl_ %+ %1 %+ _u32
519
520 %ifdef RT_ARCH_AMD64
521BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
522 PROLOGUE_3_ARGS
523 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
524 %1 qword [A0], A1
525 IEM_SAVE_FLAGS A2, %3, %4
526 EPILOGUE_3_ARGS_EX 8
527ENDPROC iemAImpl_ %+ %1 %+ _u64
528 %endif ; RT_ARCH_AMD64
529
530 %if %2 != 0 ; locked versions requested?
531
532BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
533 PROLOGUE_3_ARGS
534 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
535 lock %1 byte [A0], A1_8
536 IEM_SAVE_FLAGS A2, %3, %4
537 EPILOGUE_3_ARGS
538ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
539
540BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
541 PROLOGUE_3_ARGS
542 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
543 lock %1 word [A0], A1_16
544 IEM_SAVE_FLAGS A2, %3, %4
545 EPILOGUE_3_ARGS
546ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
547
548BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
549 PROLOGUE_3_ARGS
550 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
551 lock %1 dword [A0], A1_32
552 IEM_SAVE_FLAGS A2, %3, %4
553 EPILOGUE_3_ARGS
554ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
555
556 %ifdef RT_ARCH_AMD64
557BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
558 PROLOGUE_3_ARGS
559 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
560 lock %1 qword [A0], A1
561 IEM_SAVE_FLAGS A2, %3, %4
562 EPILOGUE_3_ARGS_EX 8
563ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
564 %endif ; RT_ARCH_AMD64
565 %endif ; locked
566%endmacro
567
568; instr,lock, modified-flags, undefined flags
569IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
570IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
571IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
572IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
573IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
574IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
575IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
576IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
577IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
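;
; For reference (added comment, not part of the original source): each
; IEMIMPL_BIN_OP line above expands to iemAImpl_<op>_u8, _u16, _u32 (and _u64
; on AMD64), plus matching _locked variants when the second argument is
; non-zero.  'IEMIMPL_BIN_OP add, 1, ...' thus yields iemAImpl_add_u8,
; iemAImpl_add_u8_locked, ..., iemAImpl_add_u64_locked, whereas cmp and test
; only get the unlocked helpers.
;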
578
579
580;;
581; Macro for implementing a binary operator, VEX variant with separate input/output.
582;
583; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
584; where the 64-bit accesses require hand coding.
585;
586; All the functions take a pointer to the destination memory operand in A0,
587; the first source register operand in A1, the second source register operand
588; in A2 and a pointer to eflags in A3.
589;
590; @param 1 The instruction mnemonic.
591; @param 2 The modified flags.
592; @param 3 The undefined flags.
593;
594%macro IEMIMPL_VEX_BIN_OP 3
595BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
596 PROLOGUE_4_ARGS
597 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
598 %1 T0_32, A1_32, A2_32
599 mov [A0], T0_32
600 IEM_SAVE_FLAGS A3, %2, %3
601 EPILOGUE_4_ARGS
602ENDPROC iemAImpl_ %+ %1 %+ _u32
603
604 %ifdef RT_ARCH_AMD64
605BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
606 PROLOGUE_4_ARGS
607 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
608 %1 T0, A1, A2
609 mov [A0], T0
610 IEM_SAVE_FLAGS A3, %2, %3
611 EPILOGUE_4_ARGS
612ENDPROC iemAImpl_ %+ %1 %+ _u64
613 %endif ; RT_ARCH_AMD64
614%endmacro
615
616; instr, modified-flags, undefined-flags
617IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
618IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
619IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
620
621;;
622; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
623;
624; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
625; where the 64-bit accesses require hand coding.
626;
627; All the functions take a pointer to the destination memory operand in A0,
628; the source register operand in A1 and a pointer to eflags in A2.
629;
630; @param 1 The instruction mnemonic.
631; @param 2 The modified flags.
632; @param 3 The undefined flags.
633;
634%macro IEMIMPL_VEX_BIN_OP_2 3
635BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
636 PROLOGUE_4_ARGS
637 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
638 mov T0_32, [A0]
639 %1 T0_32, A1_32
640 mov [A0], T0_32
641 IEM_SAVE_FLAGS A2, %2, %3
642 EPILOGUE_4_ARGS
643ENDPROC iemAImpl_ %+ %1 %+ _u32
644
645 %ifdef RT_ARCH_AMD64
646BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
647 PROLOGUE_4_ARGS
648 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
649 mov T0, [A0]
650 %1 T0, A1
651 mov [A0], T0
652 IEM_SAVE_FLAGS A2, %2, %3
653 EPILOGUE_4_ARGS
654ENDPROC iemAImpl_ %+ %1 %+ _u64
655 %endif ; RT_ARCH_AMD64
656%endmacro
657
658; instr, modified-flags, undefined-flags
659IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
660IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
661IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
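;
; Informational note (added comment, not part of the original source): the
; native BMI1 instructions used above are simple bit tricks:
;   blsr   dst, src     ; dst = src & (src - 1)   - clear the lowest set bit
;   blsmsk dst, src     ; dst = src ^ (src - 1)   - mask up to the lowest set bit
;   blsi   dst, src     ; dst = src & -src        - isolate the lowest set bit
; The C fallbacks mentioned above presumably compute the same expressions.
;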
662
663
664;;
665; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
666;
667; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
668; where the 64-bit accesses require hand coding.
669;
670; All the functions take a pointer to the destination memory operand in A0,
671; the first source register operand in A1 and the second source register operand
672; in A2; no eflags pointer is needed as these instructions leave EFLAGS alone.
673;
674; @param 1 The instruction mnemonic.
675; @param 2 Fallback instruction if applicable.
676; @param 3 Whether to emit fallback or not.
677;
678%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
679BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
680 PROLOGUE_3_ARGS
681 %1 T0_32, A1_32, A2_32
682 mov [A0], T0_32
683 EPILOGUE_3_ARGS
684ENDPROC iemAImpl_ %+ %1 %+ _u32
685
686 %if %3
687BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
688 PROLOGUE_3_ARGS
689 %ifdef ASM_CALL64_GCC
690 mov cl, A2_8
691 %2 A1_32, cl
692 mov [A0], A1_32
693 %else
694 xchg A2, A0
695 %2 A1_32, cl
696 mov [A2], A1_32
697 %endif
698 EPILOGUE_3_ARGS
699ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
700 %endif
701
702 %ifdef RT_ARCH_AMD64
703BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
704 PROLOGUE_3_ARGS
705 %1 T0, A1, A2
706 mov [A0], T0
707 EPILOGUE_3_ARGS
708ENDPROC iemAImpl_ %+ %1 %+ _u64
709
710 %if %3
711BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
712 PROLOGUE_3_ARGS
713 %ifdef ASM_CALL64_GCC
714 mov cl, A2_8
715 %2 A1, cl
716 mov [A0], A1_32
717 %else
718 xchg A2, A0
719 %2 A1, cl
720 mov [A2], A1_32
721 %endif
722 mov [A0], A1
723 EPILOGUE_3_ARGS
724ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
725 %endif
726 %endif ; RT_ARCH_AMD64
727%endmacro
728
729; instr, fallback instr, emit fallback
730IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
731IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
732IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
733IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
734IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
735
736
737;
738; RORX uses an immediate byte for the shift count, so we only do a
739; fallback implementation of that one.
740;
741BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
742 PROLOGUE_3_ARGS
743 %ifdef ASM_CALL64_GCC
744 mov cl, A2_8
745 ror A1_32, cl
746 mov [A0], A1_32
747 %else
748 xchg A2, A0
749 ror A1_32, cl
750 mov [A2], A1_32
751 %endif
752 EPILOGUE_3_ARGS
753ENDPROC iemAImpl_rorx_u32
754
755 %ifdef RT_ARCH_AMD64
756BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
757 PROLOGUE_3_ARGS
758 %ifdef ASM_CALL64_GCC
759 mov cl, A2_8
760 ror A1, cl
761 mov [A0], A1
762 %else
763 xchg A2, A0
764 ror A1, cl
765 mov [A2], A1
766 %endif
767 EPILOGUE_3_ARGS
768ENDPROC iemAImpl_rorx_u64
769 %endif ; RT_ARCH_AMD64
770
771
772;
773; MULX
774;
775BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
776 PROLOGUE_4_ARGS
777%ifdef ASM_CALL64_GCC
778 ; A2_32 is EDX - perfect
779 mulx T0_32, T1_32, A3_32
780 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
781 mov [A0], T0_32
782%else
783 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
784 xchg A1, A2
785 mulx T0_32, T1_32, A3_32
786 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
787 mov [A0], T0_32
788%endif
789 EPILOGUE_4_ARGS
790ENDPROC iemAImpl_mulx_u32
791
792
793BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
794 PROLOGUE_4_ARGS
795%ifdef ASM_CALL64_GCC
796 ; A2_32 is EDX, T0_32 is EAX
797 mov eax, A3_32
798 mul A2_32
799 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
800 mov [A0], edx
801%else
802 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
803 xchg A1, A2
804 mov eax, A3_32
805 mul A2_32
806 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
807 mov [A0], edx
808%endif
809 EPILOGUE_4_ARGS
810ENDPROC iemAImpl_mulx_u32_fallback
811
812%ifdef RT_ARCH_AMD64
813BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
814 PROLOGUE_4_ARGS
815%ifdef ASM_CALL64_GCC
816 ; A2 is RDX - perfect
817 mulx T0, T1, A3
818 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
819 mov [A0], T0
820%else
821 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
822 xchg A1, A2
823 mulx T0, T1, A3
824 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
825 mov [A0], T0
826%endif
827 EPILOGUE_4_ARGS
828ENDPROC iemAImpl_mulx_u64
829
830
831BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
832 PROLOGUE_4_ARGS
833%ifdef ASM_CALL64_GCC
834 ; A2 is RDX, T0 is RAX
835 mov rax, A3
836 mul A2
837 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
838 mov [A0], rdx
839%else
840 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
841 xchg A1, A2
842 mov rax, A3
843 mul A2
844 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
845 mov [A0], rdx
846%endif
847 EPILOGUE_4_ARGS
848ENDPROC iemAImpl_mulx_u64_fallback
849
850%endif
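;
; Informational note (added comment, not part of the original source): MULX
; multiplies its implicit rDX/EDX operand by the given source and writes the
; unsigned high and low halves to the two destinations without touching
; EFLAGS, which is why no IEM_*_FLAGS macros appear in the helpers above:
;       mulx    T0, T1, A3      ; T0 = high64(RDX * A3), T1 = low64(RDX * A3)
;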
851
852
853;;
854; Macro for implementing a bit operator.
855;
856; This will generate code for the 16, 32 and 64 bit accesses with locked
857; variants, except on 32-bit systems where the 64-bit accesses require hand
858; coding.
859;
860; All the functions take a pointer to the destination memory operand in A0,
861; the source register operand in A1 and a pointer to eflags in A2.
862;
863; @param 1 The instruction mnemonic.
864; @param 2 Non-zero if there should be a locked version.
865; @param 3 The modified flags.
866; @param 4 The undefined flags.
867;
868%macro IEMIMPL_BIT_OP 4
869BEGINCODE
870BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
871 PROLOGUE_3_ARGS
872 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
873 %1 word [A0], A1_16
874 IEM_SAVE_FLAGS A2, %3, %4
875 EPILOGUE_3_ARGS
876ENDPROC iemAImpl_ %+ %1 %+ _u16
877
878BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
879 PROLOGUE_3_ARGS
880 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
881 %1 dword [A0], A1_32
882 IEM_SAVE_FLAGS A2, %3, %4
883 EPILOGUE_3_ARGS
884ENDPROC iemAImpl_ %+ %1 %+ _u32
885
886 %ifdef RT_ARCH_AMD64
887BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
888 PROLOGUE_3_ARGS
889 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
890 %1 qword [A0], A1
891 IEM_SAVE_FLAGS A2, %3, %4
892 EPILOGUE_3_ARGS_EX 8
893ENDPROC iemAImpl_ %+ %1 %+ _u64
894 %endif ; RT_ARCH_AMD64
895
896 %if %2 != 0 ; locked versions requested?
897
898BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
899 PROLOGUE_3_ARGS
900 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
901 lock %1 word [A0], A1_16
902 IEM_SAVE_FLAGS A2, %3, %4
903 EPILOGUE_3_ARGS
904ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
905
906BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
907 PROLOGUE_3_ARGS
908 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
909 lock %1 dword [A0], A1_32
910 IEM_SAVE_FLAGS A2, %3, %4
911 EPILOGUE_3_ARGS
912ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
913
914 %ifdef RT_ARCH_AMD64
915BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
916 PROLOGUE_3_ARGS
917 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
918 lock %1 qword [A0], A1
919 IEM_SAVE_FLAGS A2, %3, %4
920 EPILOGUE_3_ARGS_EX 8
921ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
922 %endif ; RT_ARCH_AMD64
923 %endif ; locked
924%endmacro
925IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
926IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
927IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
928IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
929
930;;
931; Macro for implementing a bit search operator.
932;
933; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
934; systems where the 64-bit accesses require hand coding.
935;
936; All the functions take a pointer to the destination memory operand in A0,
937; the source register operand in A1 and a pointer to eflags in A2.
938;
939; In the ZF case the destination register is 'undefined', however it seems that
940; both AMD and Intel just leave it as is.  The undefined EFLAGS differ between
941; AMD and Intel, and according to https://www.sandpile.org/x86/flags.htm also between
942; Intel microarchitectures.  We only implement the 'intel' and 'amd' variations with
943; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
944;
945; @param 1 The instruction mnemonic.
946; @param 2 The modified flags.
947; @param 3 The undefined flags.
948; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
949;
950%macro IEMIMPL_BIT_OP2 4
951BEGINCODE
952BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
953 PROLOGUE_3_ARGS
954 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
955 %1 T0_16, A1_16
956%if %4 != 0
957 jz .unchanged_dst
958%endif
959 mov [A0], T0_16
960.unchanged_dst:
961 IEM_SAVE_FLAGS A2, %2, %3
962 EPILOGUE_3_ARGS
963ENDPROC iemAImpl_ %+ %1 %+ _u16
964
965BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
966 PROLOGUE_3_ARGS
967 %1 T1_16, A1_16
968%if %4 != 0
969 jz .unchanged_dst
970%endif
971 mov [A0], T1_16
972 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
973 EPILOGUE_3_ARGS
974.unchanged_dst:
975 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
976 EPILOGUE_3_ARGS
977ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
978
979BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
980 PROLOGUE_3_ARGS
981 %1 T0_16, A1_16
982%if %4 != 0
983 jz .unchanged_dst
984%endif
985 mov [A0], T0_16
986.unchanged_dst:
987 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
988 EPILOGUE_3_ARGS
989ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
990
991
992BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
993 PROLOGUE_3_ARGS
994 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
995 %1 T0_32, A1_32
996%if %4 != 0
997 jz .unchanged_dst
998%endif
999 mov [A0], T0_32
1000.unchanged_dst:
1001 IEM_SAVE_FLAGS A2, %2, %3
1002 EPILOGUE_3_ARGS
1003ENDPROC iemAImpl_ %+ %1 %+ _u32
1004
1005BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1006 PROLOGUE_3_ARGS
1007 %1 T1_32, A1_32
1008%if %4 != 0
1009 jz .unchanged_dst
1010%endif
1011 mov [A0], T1_32
1012 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1013 EPILOGUE_3_ARGS
1014.unchanged_dst:
1015 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1016 EPILOGUE_3_ARGS
1017ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1018
1019BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1020 PROLOGUE_3_ARGS
1021 %1 T0_32, A1_32
1022%if %4 != 0
1023 jz .unchanged_dst
1024%endif
1025 mov [A0], T0_32
1026.unchanged_dst:
1027 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1028 EPILOGUE_3_ARGS
1029ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1030
1031
1032 %ifdef RT_ARCH_AMD64
1033
1034BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1035 PROLOGUE_3_ARGS
1036 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1037 %1 T0, A1
1038%if %4 != 0
1039 jz .unchanged_dst
1040%endif
1041 mov [A0], T0
1042.unchanged_dst:
1043 IEM_SAVE_FLAGS A2, %2, %3
1044 EPILOGUE_3_ARGS_EX 8
1045ENDPROC iemAImpl_ %+ %1 %+ _u64
1046
1047BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1048 PROLOGUE_3_ARGS
1049 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1050 %1 T1, A1
1051%if %4 != 0
1052 jz .unchanged_dst
1053%endif
1054 mov [A0], T1
1055 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1056 EPILOGUE_3_ARGS
1057.unchanged_dst:
1058 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1059 EPILOGUE_3_ARGS
1060ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1061
1062BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1063 PROLOGUE_3_ARGS
1064 %1 T0, A1
1065%if %4 != 0
1066 jz .unchanged_dst
1067%endif
1068 mov [A0], T0
1069.unchanged_dst:
1070 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1071 EPILOGUE_3_ARGS_EX 8
1072ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1073
1074 %endif ; RT_ARCH_AMD64
1075%endmacro
1076
1077IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1078IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1079IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1080IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1081
1082
1083;;
1084; Macro for implementing POPCNT.
1085;
1086; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1087; systems where the 64-bit accesses require hand coding.
1088;
1089; All the functions take a pointer to the destination memory operand in A0,
1090; the source register operand in A1 and a pointer to eflags in A2.
1091;
1092; ASSUMES Intel and AMD set EFLAGS the same way.
1093;
1094; ASSUMES the instruction does not support memory destination.
1095;
1096; @param 1 The instruction mnemonic.
1097; @param 2 The modified flags.
1098; @param 3 The undefined flags.
1099;
1100%macro IEMIMPL_BIT_OP3 3
1101BEGINCODE
1102BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1103 PROLOGUE_3_ARGS
1104 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1105 %1 T0_16, A1_16
1106 mov [A0], T0_16
1107 IEM_SAVE_FLAGS A2, %2, %3
1108 EPILOGUE_3_ARGS
1109ENDPROC iemAImpl_ %+ %1 %+ _u16
1110
1111BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1112 PROLOGUE_3_ARGS
1113 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1114 %1 T0_32, A1_32
1115 mov [A0], T0_32
1116 IEM_SAVE_FLAGS A2, %2, %3
1117 EPILOGUE_3_ARGS
1118ENDPROC iemAImpl_ %+ %1 %+ _u32
1119
1120 %ifdef RT_ARCH_AMD64
1121BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1122 PROLOGUE_3_ARGS
1123 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1124 %1 T0, A1
1125 mov [A0], T0
1126 IEM_SAVE_FLAGS A2, %2, %3
1127 EPILOGUE_3_ARGS_EX 8
1128ENDPROC iemAImpl_ %+ %1 %+ _u64
1129 %endif ; RT_ARCH_AMD64
1130%endmacro
1131IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1132
1133
1134;
1135; IMUL is a similar but slightly different case (no lock, no mem dst).
1136; The rDX:rAX variant of imul is handled together with mul further down.
1137;
1138BEGINCODE
1139; @param 1 EFLAGS that are modified.
1140; @param 2 Undefined EFLAGS.
1141; @param 3 Function suffix.
1142; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored),
1143; 2 for AMD (set AF, clear PF, ZF and SF).
1144%macro IEMIMPL_IMUL_TWO 4
1145BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1146 PROLOGUE_3_ARGS
1147 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1148 imul A1_16, word [A0]
1149 mov [A0], A1_16
1150 %if %4 != 1
1151 IEM_SAVE_FLAGS A2, %1, %2
1152 %else
1153 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1154 %endif
1155 EPILOGUE_3_ARGS
1156ENDPROC iemAImpl_imul_two_u16 %+ %3
1157
1158BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1159 PROLOGUE_3_ARGS
1160 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1161 imul A1_32, dword [A0]
1162 mov [A0], A1_32
1163 %if %4 != 1
1164 IEM_SAVE_FLAGS A2, %1, %2
1165 %else
1166 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1167 %endif
1168 EPILOGUE_3_ARGS
1169ENDPROC iemAImpl_imul_two_u32 %+ %3
1170
1171 %ifdef RT_ARCH_AMD64
1172BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1173 PROLOGUE_3_ARGS
1174 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1175 imul A1, qword [A0]
1176 mov [A0], A1
1177 %if %4 != 1
1178 IEM_SAVE_FLAGS A2, %1, %2
1179 %else
1180 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1181 %endif
1182 EPILOGUE_3_ARGS_EX 8
1183ENDPROC iemAImpl_imul_two_u64 %+ %3
1184 %endif ; RT_ARCH_AMD64
1185%endmacro
1186IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1187IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1188IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
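;
; For reference (added comment, not part of the original source): the three
; instantiations above emit iemAImpl_imul_two_u16/_u32/_u64 in a native, an
; _intel and an _amd flavour; they differ only in how the architecturally
; undefined SF, ZF, AF and PF bits are reported back to the guest.
;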
1189
1190
1191;
1192; XCHG for memory operands. This implies locking. No flag changes.
1193;
1194; Each function takes two arguments, first the pointer to the memory,
1195; then the pointer to the register. They all return void.
1196;
1197BEGINCODE
1198BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1199 PROLOGUE_2_ARGS
1200 mov T0_8, [A1]
1201 xchg [A0], T0_8
1202 mov [A1], T0_8
1203 EPILOGUE_2_ARGS
1204ENDPROC iemAImpl_xchg_u8_locked
1205
1206BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1207 PROLOGUE_2_ARGS
1208 mov T0_16, [A1]
1209 xchg [A0], T0_16
1210 mov [A1], T0_16
1211 EPILOGUE_2_ARGS
1212ENDPROC iemAImpl_xchg_u16_locked
1213
1214BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1215 PROLOGUE_2_ARGS
1216 mov T0_32, [A1]
1217 xchg [A0], T0_32
1218 mov [A1], T0_32
1219 EPILOGUE_2_ARGS
1220ENDPROC iemAImpl_xchg_u32_locked
1221
1222%ifdef RT_ARCH_AMD64
1223BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1224 PROLOGUE_2_ARGS
1225 mov T0, [A1]
1226 xchg [A0], T0
1227 mov [A1], T0
1228 EPILOGUE_2_ARGS
1229ENDPROC iemAImpl_xchg_u64_locked
1230%endif
1231
1232; Unlocked variants for fDisregardLock mode.
1233
1234BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
1235 PROLOGUE_2_ARGS
1236 mov T0_8, [A1]
1237 mov T1_8, [A0]
1238 mov [A0], T0_8
1239 mov [A1], T1_8
1240 EPILOGUE_2_ARGS
1241ENDPROC iemAImpl_xchg_u8_unlocked
1242
1243BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
1244 PROLOGUE_2_ARGS
1245 mov T0_16, [A1]
1246 mov T1_16, [A0]
1247 mov [A0], T0_16
1248 mov [A1], T1_16
1249 EPILOGUE_2_ARGS
1250ENDPROC iemAImpl_xchg_u16_unlocked
1251
1252BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
1253 PROLOGUE_2_ARGS
1254 mov T0_32, [A1]
1255 mov T1_32, [A0]
1256 mov [A0], T0_32
1257 mov [A1], T1_32
1258 EPILOGUE_2_ARGS
1259ENDPROC iemAImpl_xchg_u32_unlocked
1260
1261%ifdef RT_ARCH_AMD64
1262BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
1263 PROLOGUE_2_ARGS
1264 mov T0, [A1]
1265 mov T1, [A0]
1266 mov [A0], T0
1267 mov [A1], T1
1268 EPILOGUE_2_ARGS
1269ENDPROC iemAImpl_xchg_u64_unlocked
1270%endif
1271
1272
1273;
1274; XADD for memory operands.
1275;
1276; Each function takes three arguments, first the pointer to the
1277; memory/register, then the pointer to the register, and finally a pointer to
1278; eflags. They all return void.
1279;
1280BEGINCODE
1281BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1282 PROLOGUE_3_ARGS
1283 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1284 mov T0_8, [A1]
1285 xadd [A0], T0_8
1286 mov [A1], T0_8
1287 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1288 EPILOGUE_3_ARGS
1289ENDPROC iemAImpl_xadd_u8
1290
1291BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1292 PROLOGUE_3_ARGS
1293 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1294 mov T0_16, [A1]
1295 xadd [A0], T0_16
1296 mov [A1], T0_16
1297 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1298 EPILOGUE_3_ARGS
1299ENDPROC iemAImpl_xadd_u16
1300
1301BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1302 PROLOGUE_3_ARGS
1303 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1304 mov T0_32, [A1]
1305 xadd [A0], T0_32
1306 mov [A1], T0_32
1307 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1308 EPILOGUE_3_ARGS
1309ENDPROC iemAImpl_xadd_u32
1310
1311%ifdef RT_ARCH_AMD64
1312BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1313 PROLOGUE_3_ARGS
1314 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1315 mov T0, [A1]
1316 xadd [A0], T0
1317 mov [A1], T0
1318 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1319 EPILOGUE_3_ARGS
1320ENDPROC iemAImpl_xadd_u64
1321%endif ; RT_ARCH_AMD64
1322
1323BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
1324 PROLOGUE_3_ARGS
1325 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1326 mov T0_8, [A1]
1327 lock xadd [A0], T0_8
1328 mov [A1], T0_8
1329 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1330 EPILOGUE_3_ARGS
1331ENDPROC iemAImpl_xadd_u8_locked
1332
1333BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1334 PROLOGUE_3_ARGS
1335 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1336 mov T0_16, [A1]
1337 lock xadd [A0], T0_16
1338 mov [A1], T0_16
1339 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1340 EPILOGUE_3_ARGS
1341ENDPROC iemAImpl_xadd_u16_locked
1342
1343BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1344 PROLOGUE_3_ARGS
1345 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1346 mov T0_32, [A1]
1347 lock xadd [A0], T0_32
1348 mov [A1], T0_32
1349 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1350 EPILOGUE_3_ARGS
1351ENDPROC iemAImpl_xadd_u32_locked
1352
1353%ifdef RT_ARCH_AMD64
1354BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1355 PROLOGUE_3_ARGS
1356 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1357 mov T0, [A1]
1358 lock xadd [A0], T0
1359 mov [A1], T0
1360 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1361 EPILOGUE_3_ARGS
1362ENDPROC iemAImpl_xadd_u64_locked
1363%endif ; RT_ARCH_AMD64
1364
1365
1366;
1367; CMPXCHG8B.
1368;
1369; These are tricky register wise, so the code is duplicated for each calling
1370; convention.
1371;
1372; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1373;
1374; C-proto:
1375; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1376; uint32_t *pEFlags));
1377;
1378; Note! Identical to iemAImpl_cmpxchg16b.
1379;
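;
; Behaviour recap (added comment, not part of the original source): cmpxchg8b
; compares EDX:EAX with the 64-bit memory operand; on a match it stores
; ECX:EBX there and sets ZF, otherwise it loads the memory value into EDX:EAX
; and clears ZF.  That is why the helpers below write EAX/EDX back through
; pu64EaxEdx after the instruction and only report ZF via pEFlags.
;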
1380BEGINCODE
1381BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1382%ifdef RT_ARCH_AMD64
1383 %ifdef ASM_CALL64_MSC
1384 push rbx
1385
1386 mov r11, rdx ; pu64EaxEdx (is also T1)
1387 mov r10, rcx ; pu64Dst
1388
1389 mov ebx, [r8]
1390 mov ecx, [r8 + 4]
1391 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1392 mov eax, [r11]
1393 mov edx, [r11 + 4]
1394
1395 cmpxchg8b [r10]
1396
1397 mov [r11], eax
1398 mov [r11 + 4], edx
1399 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1400
1401 pop rbx
1402 ret
1403 %else
1404 push rbx
1405
1406 mov r10, rcx ; pEFlags
1407 mov r11, rdx ; pu64EbxEcx (is also T1)
1408
1409 mov ebx, [r11]
1410 mov ecx, [r11 + 4]
1411 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1412 mov eax, [rsi]
1413 mov edx, [rsi + 4]
1414
1415 cmpxchg8b [rdi]
1416
1417 mov [rsi], eax
1418 mov [rsi + 4], edx
1419 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1420
1421 pop rbx
1422 ret
1423
1424 %endif
1425%else
1426 push esi
1427 push edi
1428 push ebx
1429 push ebp
1430
1431 mov edi, ecx ; pu64Dst
1432 mov esi, edx ; pu64EaxEdx
1433 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1434 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1435
1436 mov ebx, [ecx]
1437 mov ecx, [ecx + 4]
1438 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1439 mov eax, [esi]
1440 mov edx, [esi + 4]
1441
1442 cmpxchg8b [edi]
1443
1444 mov [esi], eax
1445 mov [esi + 4], edx
1446 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1447
1448 pop ebp
1449 pop ebx
1450 pop edi
1451 pop esi
1452 ret 8
1453%endif
1454ENDPROC iemAImpl_cmpxchg8b
1455
1456BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1457%ifdef RT_ARCH_AMD64
1458 %ifdef ASM_CALL64_MSC
1459 push rbx
1460
1461 mov r11, rdx ; pu64EaxEdx (is also T1)
1462 mov r10, rcx ; pu64Dst
1463
1464 mov ebx, [r8]
1465 mov ecx, [r8 + 4]
1466 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1467 mov eax, [r11]
1468 mov edx, [r11 + 4]
1469
1470 lock cmpxchg8b [r10]
1471
1472 mov [r11], eax
1473 mov [r11 + 4], edx
1474 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1475
1476 pop rbx
1477 ret
1478 %else
1479 push rbx
1480
1481 mov r10, rcx ; pEFlags
1482 mov r11, rdx ; pu64EbxEcx (is also T1)
1483
1484 mov ebx, [r11]
1485 mov ecx, [r11 + 4]
1486 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1487 mov eax, [rsi]
1488 mov edx, [rsi + 4]
1489
1490 lock cmpxchg8b [rdi]
1491
1492 mov [rsi], eax
1493 mov [rsi + 4], edx
1494 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1495
1496 pop rbx
1497 ret
1498
1499 %endif
1500%else
1501 push esi
1502 push edi
1503 push ebx
1504 push ebp
1505
1506 mov edi, ecx ; pu64Dst
1507 mov esi, edx ; pu64EaxEdx
1508 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1509 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1510
1511 mov ebx, [ecx]
1512 mov ecx, [ecx + 4]
1513 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1514 mov eax, [esi]
1515 mov edx, [esi + 4]
1516
1517 lock cmpxchg8b [edi]
1518
1519 mov [esi], eax
1520 mov [esi + 4], edx
1521 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1522
1523 pop ebp
1524 pop ebx
1525 pop edi
1526 pop esi
1527 ret 8
1528%endif
1529ENDPROC iemAImpl_cmpxchg8b_locked
1530
1531%ifdef RT_ARCH_AMD64
1532
1533;
1534; CMPXCHG16B.
1535;
1536; These are tricky register wise, so the code is duplicated for each calling
1537; convention.
1538;
1539; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1540;
1541; C-proto:
1542; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1543; uint32_t *pEFlags));
1544;
1545; Note! Identical to iemAImpl_cmpxchg8b.
1546;
1547BEGINCODE
1548BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
1549 %ifdef ASM_CALL64_MSC
1550 push rbx
1551
1552 mov r11, rdx ; pu64RaxRdx (is also T1)
1553 mov r10, rcx ; pu64Dst
1554
1555 mov rbx, [r8]
1556 mov rcx, [r8 + 8]
1557 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1558 mov rax, [r11]
1559 mov rdx, [r11 + 8]
1560
1561 cmpxchg16b [r10]
1562
1563 mov [r11], rax
1564 mov [r11 + 8], rdx
1565 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1566
1567 pop rbx
1568 ret
1569 %else
1570 push rbx
1571
1572 mov r10, rcx ; pEFlags
1573 mov r11, rdx ; pu64RbxRcx (is also T1)
1574
1575 mov rbx, [r11]
1576 mov rcx, [r11 + 8]
1577 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1578 mov rax, [rsi]
1579 mov rdx, [rsi + 8]
1580
1581 cmpxchg16b [rdi]
1582
1583 mov [rsi], rax
1584 mov [rsi + 8], rdx
1585 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1586
1587 pop rbx
1588 ret
1589
1590 %endif
1591ENDPROC iemAImpl_cmpxchg16b
1592
1593BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
1594 %ifdef ASM_CALL64_MSC
1595 push rbx
1596
1597 mov r11, rdx ; pu64RaxRdx (is also T1)
1598 mov r10, rcx ; pu64Dst
1599
1600 mov rbx, [r8]
1601 mov rcx, [r8 + 8]
1602 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1603 mov rax, [r11]
1604 mov rdx, [r11 + 8]
1605
1606 lock cmpxchg16b [r10]
1607
1608 mov [r11], rax
1609 mov [r11 + 8], rdx
1610 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1611
1612 pop rbx
1613 ret
1614 %else
1615 push rbx
1616
1617 mov r10, rcx ; pEFlags
1618 mov r11, rdx ; pu64RbxRcx (is also T1)
1619
1620 mov rbx, [r11]
1621 mov rcx, [r11 + 8]
1622 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1623 mov rax, [rsi]
1624 mov rdx, [rsi + 8]
1625
1626 lock cmpxchg16b [rdi]
1627
1628 mov [rsi], rax
1629 mov [rsi + 8], rdx
1630 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1631
1632 pop rbx
1633 ret
1634
1635 %endif
1636ENDPROC iemAImpl_cmpxchg16b_locked
1637
1638%endif ; RT_ARCH_AMD64
1639
1640
1641;
1642; CMPXCHG.
1643;
1644; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1645;
1646; C-proto:
1647; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
1648;
1649BEGINCODE
1650%macro IEMIMPL_CMPXCHG 2
1651BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1652 PROLOGUE_4_ARGS
1653 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1654 mov al, [A1]
1655 %1 cmpxchg [A0], A2_8
1656 mov [A1], al
1657 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1658 EPILOGUE_4_ARGS
1659ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1660
1661BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1662 PROLOGUE_4_ARGS
1663 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1664 mov ax, [A1]
1665 %1 cmpxchg [A0], A2_16
1666 mov [A1], ax
1667 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1668 EPILOGUE_4_ARGS
1669ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1670
1671BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1672 PROLOGUE_4_ARGS
1673 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1674 mov eax, [A1]
1675 %1 cmpxchg [A0], A2_32
1676 mov [A1], eax
1677 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1678 EPILOGUE_4_ARGS
1679ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1680
1681BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1682%ifdef RT_ARCH_AMD64
1683 PROLOGUE_4_ARGS
1684 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1685 mov rax, [A1]
1686 %1 cmpxchg [A0], A2
1687 mov [A1], rax
1688 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1689 EPILOGUE_4_ARGS
1690%else
1691 ;
1692 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
1693 ;
1694 push esi
1695 push edi
1696 push ebx
1697 push ebp
1698
1699 mov edi, ecx ; pu64Dst
1700 mov esi, edx ; pu64Rax
1701 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1702 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1703
1704 mov ebx, [ecx]
1705 mov ecx, [ecx + 4]
1706 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1707 mov eax, [esi]
1708 mov edx, [esi + 4]
1709
1710 lock cmpxchg8b [edi]
1711
1712 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
1713 jz .cmpxchg8b_not_equal
1714 cmp eax, eax ; just set the other flags.
1715.store:
1716 mov [esi], eax
1717 mov [esi + 4], edx
1718 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1719
1720 pop ebp
1721 pop ebx
1722 pop edi
1723 pop esi
1724 ret 8
1725
1726.cmpxchg8b_not_equal:
1727 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1728 jne .store
1729 cmp [esi], eax
1730 jmp .store
1731
1732%endif
1733ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1734%endmacro ; IEMIMPL_CMPXCHG
1735
1736IEMIMPL_CMPXCHG , ,
1737IEMIMPL_CMPXCHG lock, _locked
1738
1739;;
1740; Macro for implementing a unary operator.
1741;
1742; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1743; variants, except on 32-bit systems where the 64-bit accesses require hand
1744; coding.
1745;
1746; All the functions take a pointer to the destination memory operand in A0
1747; and a pointer to eflags in A1.
1748;
1749; @param 1 The instruction mnemonic.
1750; @param 2 The modified flags.
1751; @param 3 The undefined flags.
1752;
1753%macro IEMIMPL_UNARY_OP 3
1754BEGINCODE
1755BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1756 PROLOGUE_2_ARGS
1757 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1758 %1 byte [A0]
1759 IEM_SAVE_FLAGS A1, %2, %3
1760 EPILOGUE_2_ARGS
1761ENDPROC iemAImpl_ %+ %1 %+ _u8
1762
1763BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1764 PROLOGUE_2_ARGS
1765 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1766 lock %1 byte [A0]
1767 IEM_SAVE_FLAGS A1, %2, %3
1768 EPILOGUE_2_ARGS
1769ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1770
1771BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1772 PROLOGUE_2_ARGS
1773 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1774 %1 word [A0]
1775 IEM_SAVE_FLAGS A1, %2, %3
1776 EPILOGUE_2_ARGS
1777ENDPROC iemAImpl_ %+ %1 %+ _u16
1778
1779BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1780 PROLOGUE_2_ARGS
1781 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1782 lock %1 word [A0]
1783 IEM_SAVE_FLAGS A1, %2, %3
1784 EPILOGUE_2_ARGS
1785ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1786
1787BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1788 PROLOGUE_2_ARGS
1789 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1790 %1 dword [A0]
1791 IEM_SAVE_FLAGS A1, %2, %3
1792 EPILOGUE_2_ARGS
1793ENDPROC iemAImpl_ %+ %1 %+ _u32
1794
1795BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1796 PROLOGUE_2_ARGS
1797 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1798 lock %1 dword [A0]
1799 IEM_SAVE_FLAGS A1, %2, %3
1800 EPILOGUE_2_ARGS
1801ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1802
1803 %ifdef RT_ARCH_AMD64
1804BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1805 PROLOGUE_2_ARGS
1806 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1807 %1 qword [A0]
1808 IEM_SAVE_FLAGS A1, %2, %3
1809 EPILOGUE_2_ARGS
1810ENDPROC iemAImpl_ %+ %1 %+ _u64
1811
1812BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1813 PROLOGUE_2_ARGS
1814 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1815 lock %1 qword [A0]
1816 IEM_SAVE_FLAGS A1, %2, %3
1817 EPILOGUE_2_ARGS
1818ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1819 %endif ; RT_ARCH_AMD64
1820
1821%endmacro
1822
1823IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1824IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1825IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1826IEMIMPL_UNARY_OP not, 0, 0
1827
1828
1829;
1830; BSWAP. No flag changes.
1831;
1832; Each function takes one argument, pointer to the value to bswap
1833; (input/output). They all return void.
1834;
1835BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1836 PROLOGUE_1_ARGS
1837 mov T0_32, [A0] ; just in case any of the upper bits are used.
1838 db 66h
1839 bswap T0_32
1840 mov [A0], T0_32
1841 EPILOGUE_1_ARGS
1842ENDPROC iemAImpl_bswap_u16
1843
1844BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1845 PROLOGUE_1_ARGS
1846 mov T0_32, [A0]
1847 bswap T0_32
1848 mov [A0], T0_32
1849 EPILOGUE_1_ARGS
1850ENDPROC iemAImpl_bswap_u32
1851
1852BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1853%ifdef RT_ARCH_AMD64
1854 PROLOGUE_1_ARGS
1855 mov T0, [A0]
1856 bswap T0
1857 mov [A0], T0
1858 EPILOGUE_1_ARGS
1859%else
1860 PROLOGUE_1_ARGS
1861 mov T0, [A0]
1862 mov T1, [A0 + 4]
1863 bswap T0
1864 bswap T1
1865 mov [A0 + 4], T0
1866 mov [A0], T1
1867 EPILOGUE_1_ARGS
1868%endif
1869ENDPROC iemAImpl_bswap_u64
1870
1871
1872;;
1873; Macro for implementing a shift operation.
1874;
1875; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1876; 32-bit systems where the 64-bit accesses require hand coding.
1877;
1878; All the functions take a pointer to the destination memory operand in A0,
1879; the shift count in A1 and a pointer to eflags in A2.
1880;
1881; @param 1 The instruction mnemonic.
1882; @param 2 The modified flags.
1883; @param 3 The undefined flags.
1884;
1885; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1886;
1887; @note the _intel and _amd variants are implemented in C.
1888;
1889%macro IEMIMPL_SHIFT_OP 3
1890BEGINCODE
1891BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1892 PROLOGUE_3_ARGS
1893 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1894 %ifdef ASM_CALL64_GCC
1895 mov cl, A1_8
1896 %1 byte [A0], cl
1897 %else
1898 xchg A1, A0
1899 %1 byte [A1], cl
1900 %endif
1901 IEM_SAVE_FLAGS A2, %2, %3
1902 EPILOGUE_3_ARGS
1903ENDPROC iemAImpl_ %+ %1 %+ _u8
1904
1905BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1906 PROLOGUE_3_ARGS
1907 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1908 %ifdef ASM_CALL64_GCC
1909 mov cl, A1_8
1910 %1 word [A0], cl
1911 %else
1912 xchg A1, A0
1913 %1 word [A1], cl
1914 %endif
1915 IEM_SAVE_FLAGS A2, %2, %3
1916 EPILOGUE_3_ARGS
1917ENDPROC iemAImpl_ %+ %1 %+ _u16
1918
1919BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1920 PROLOGUE_3_ARGS
1921 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1922 %ifdef ASM_CALL64_GCC
1923 mov cl, A1_8
1924 %1 dword [A0], cl
1925 %else
1926 xchg A1, A0
1927 %1 dword [A1], cl
1928 %endif
1929 IEM_SAVE_FLAGS A2, %2, %3
1930 EPILOGUE_3_ARGS
1931ENDPROC iemAImpl_ %+ %1 %+ _u32
1932
1933 %ifdef RT_ARCH_AMD64
1934BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1935 PROLOGUE_3_ARGS
1936 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1937 %ifdef ASM_CALL64_GCC
1938 mov cl, A1_8
1939 %1 qword [A0], cl
1940 %else
1941 xchg A1, A0
1942 %1 qword [A1], cl
1943 %endif
1944 IEM_SAVE_FLAGS A2, %2, %3
1945 EPILOGUE_3_ARGS
1946ENDPROC iemAImpl_ %+ %1 %+ _u64
1947 %endif ; RT_ARCH_AMD64
1948
1949%endmacro
1950
1951IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1952IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1953IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
1954IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
1955IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1956IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1957IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
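;
; Illustration only (not assembled): assuming the GCC/SysV mapping A0=rdi, A1=rsi,
; A2=rdx on an AMD64 host (the real assignments come from the A*/T* macros defined
; earlier in this file), the 32-bit rol expansion boils down to roughly:
;
;   ; iemAImpl_rol_u32(puDst in rdi, cShift in sil, pfEFlags in rdx) - names illustrative
;   IEM_MAYBE_LOAD_FLAGS rdx, (X86_EFL_OF | X86_EFL_CF), 0   ; load guest eflags into host flags
;   mov     cl, sil                                          ; shift count must be in CL
;   rol     dword [rdi], cl                                  ; do the work on the guest operand
;   IEM_SAVE_FLAGS rdx, (X86_EFL_OF | X86_EFL_CF), 0         ; merge the modified flags back
;   ret
;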
1958
1959
1960;;
1961; Macro for implementing a double precision shift operation.
1962;
1963; This will generate code for the 16, 32 and 64 bit accesses, except on
1964; 32-bit systems where the 64-bit accesses require hand coding.
1965;
1966; The functions take the destination operand (r/m) in A0, the source (reg) in
1967; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1968;
1969; @param 1 The instruction mnemonic.
1970; @param 2 The modified flags.
1971; @param 3 The undefined flags.
1972;
1973; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1974;
1975; @note the _intel and _amd variants are implemented in C.
1976;
1977%macro IEMIMPL_SHIFT_DBL_OP 3
1978BEGINCODE
1979BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1980 PROLOGUE_4_ARGS
1981 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1982 %ifdef ASM_CALL64_GCC
1983 xchg A3, A2
1984 %1 [A0], A1_16, cl
1985 xchg A3, A2
1986 %else
1987 xchg A0, A2
1988 %1 [A2], A1_16, cl
1989 %endif
1990 IEM_SAVE_FLAGS A3, %2, %3
1991 EPILOGUE_4_ARGS
1992ENDPROC iemAImpl_ %+ %1 %+ _u16
1993
1994BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1995 PROLOGUE_4_ARGS
1996 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1997 %ifdef ASM_CALL64_GCC
1998 xchg A3, A2
1999 %1 [A0], A1_32, cl
2000 xchg A3, A2
2001 %else
2002 xchg A0, A2
2003 %1 [A2], A1_32, cl
2004 %endif
2005 IEM_SAVE_FLAGS A3, %2, %3
2006 EPILOGUE_4_ARGS
2007ENDPROC iemAImpl_ %+ %1 %+ _u32
2008
2009 %ifdef RT_ARCH_AMD64
2010BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
2011 PROLOGUE_4_ARGS
2012 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2013 %ifdef ASM_CALL64_GCC
2014 xchg A3, A2
2015 %1 [A0], A1, cl
2016 xchg A3, A2
2017 %else
2018 xchg A0, A2
2019 %1 [A2], A1, cl
2020 %endif
2021 IEM_SAVE_FLAGS A3, %2, %3
2022 EPILOGUE_4_ARGS_EX 12
2023ENDPROC iemAImpl_ %+ %1 %+ _u64
2024 %endif ; RT_ARCH_AMD64
2025
2026%endmacro
2027
2028IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2029IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
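;
; Worked example of what the generated 32-bit shld helper computes (operand names are
; illustrative): with *puDst = 0x12345678, uSrc = 0x9ABCDEF0 and a count of 8, the
; emitted 'shld [A0], A1_32, cl' yields
;   *puDst = (0x12345678 << 8) | (0x9ABCDEF0 >> 24) = 0x3456789A
;   CF     = last bit shifted out of the destination = bit 24 of 0x12345678 = 0
;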
2030
2031
2032;;
2033; Macro for implementing multiplication operations.
2034;
2035; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2036; 32-bit systems where the 64-bit accesses require hand coding.
2037;
2038; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2039; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2040; pointer to eflags in A3.
2041;
2042; The functions all return 0 so that the same caller code can be used for div/idiv
2043; as well as for the mul/imul implementation.
2044;
2045; @param 1 The instruction mnemonic.
2046; @param 2 The modified flags.
2047; @param 3 The undefined flags.
2048; @param 4 Name suffix.
2049; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2050;
2051; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2052;
2053%macro IEMIMPL_MUL_OP 5
2054BEGINCODE
2055BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
2056 PROLOGUE_3_ARGS
2057 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2058 mov al, [A0]
2059 %1 A1_8
2060 mov [A0], ax
2061 %if %5 != 1
2062 IEM_SAVE_FLAGS A2, %2, %3
2063 %else
2064 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
2065 %endif
2066 xor eax, eax
2067 EPILOGUE_3_ARGS
2068ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
2069
2070BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
2071 PROLOGUE_4_ARGS
2072 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2073 mov ax, [A0]
2074 %ifdef ASM_CALL64_GCC
2075 %1 A2_16
2076 mov [A0], ax
2077 mov [A1], dx
2078 %else
2079 mov T1, A1
2080 %1 A2_16
2081 mov [A0], ax
2082 mov [T1], dx
2083 %endif
2084 %if %5 != 1
2085 IEM_SAVE_FLAGS A3, %2, %3
2086 %else
2087 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
2088 %endif
2089 xor eax, eax
2090 EPILOGUE_4_ARGS
2091ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
2092
2093BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
2094 PROLOGUE_4_ARGS
2095 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2096 mov eax, [A0]
2097 %ifdef ASM_CALL64_GCC
2098 %1 A2_32
2099 mov [A0], eax
2100 mov [A1], edx
2101 %else
2102 mov T1, A1
2103 %1 A2_32
2104 mov [A0], eax
2105 mov [T1], edx
2106 %endif
2107 %if %5 != 1
2108 IEM_SAVE_FLAGS A3, %2, %3
2109 %else
2110 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
2111 %endif
2112 xor eax, eax
2113 EPILOGUE_4_ARGS
2114ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
2115
2116 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2117BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2118 PROLOGUE_4_ARGS
2119 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2120 mov rax, [A0]
2121 %ifdef ASM_CALL64_GCC
2122 %1 A2
2123 mov [A0], rax
2124 mov [A1], rdx
2125 %else
2126 mov T1, A1
2127 %1 A2
2128 mov [A0], rax
2129 mov [T1], rdx
2130 %endif
2131 %if %5 != 1
2132 IEM_SAVE_FLAGS A3, %2, %3
2133 %else
2134 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
2135 %endif
2136 xor eax, eax
2137 EPILOGUE_4_ARGS_EX 12
2138ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2139 %endif ; RT_ARCH_AMD64
2140
2141%endmacro
2142
2143IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2144IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2145IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2146IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2147IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2148IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
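;
; Worked example for the 32-bit unsigned case (parameter names are illustrative):
; calling the generated mul_u32 helper with *puRAX = 0x80000000 and a factor of 2 gives
;   product = 0x100000000  ->  *puRDX = 0x00000001, *puRAX = 0x00000000
;   CF = OF = 1 (high half non-zero); SF/ZF/AF/PF undefined for the native variant
;   return value = 0, as for all these helpers
;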
2149
2150
2151BEGINCODE
2152;;
2153; Worker function for negating the 64-bit value held in T1:T0 (32-bit register halves).
2154; @uses T0 and T1 only (input/output)
2155BEGINPROC iemAImpl_negate_T0_T1_u32
2156 push 0
2157 push 0
2158 xchg T0_32, [xSP]
2159 xchg T1_32, [xSP + xCB]
2160 sub T0_32, [xSP]
2161 sbb T1_32, [xSP + xCB]
2162 add xSP, xCB*2
2163 ret
2164ENDPROC iemAImpl_negate_T0_T1_u32
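;
; Example (illustrative values): negating T1:T0 = 0x00000001:0x00000000 (i.e. 2^32):
;   T0 = 0 - 0x00000000          = 0x00000000, borrow = 0
;   T1 = 0 - 0x00000001 - borrow = 0xFFFFFFFF
; giving T1:T0 = 0xFFFFFFFF:0x00000000, the 64-bit two's complement of the input.
;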
2165
2166%ifdef RT_ARCH_AMD64
2167;;
2168; Worker function for negating the 128-bit value held in T1:T0 (64-bit registers).
2169; @uses T0 and T1 only (input/output)
2170BEGINPROC iemAImpl_negate_T0_T1_u64
2171 push 0
2172 push 0
2173 xchg T0, [xSP]
2174 xchg T1, [xSP + xCB]
2175 sub T0, [xSP]
2176 sbb T1, [xSP + xCB]
2177 add xSP, xCB*2
2178 ret
2179ENDPROC iemAImpl_negate_T0_T1_u64
2180%endif
2181
2182
2183;;
2184; Macro for implementing division operations.
2185;
2186; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2187; 32-bit systems where the 64-bit accesses require hand coding.
2188;
2189; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2190; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2191; pointer to eflags in A3.
2192;
2193; The functions all return 0 on success and -1 if a divide error should be
2194; raised by the caller.
2195;
2196; @param 1 The instruction mnemonic.
2197; @param 2 The modified flags.
2198; @param 3 The undefined flags.
2199; @param 4 1 if signed, 0 if unsigned.
2200; @param 5 Function suffix.
2201; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2202; 2 for AMD (set AF, clear PF, ZF and SF).
2203;
2204; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2205;
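;
; Note on the signed (idiv) overflow checks in the macro below: for an N-bit divide the
; quotient must fit in [-2^(N-1), 2^(N-1) - 1], so with |dividend| = Q*|divisor| + R,
; R < |divisor|, the checks reduce to roughly:
;   - operands of the same sign: overflow if (|dividend| >> (N-1)) >= |divisor|
;   - operands of mixed sign:    overflow if (|dividend| >> (N-1)) >  |divisor|, or if it
;     is equal and (|dividend| mod 2^(N-1)) >= |divisor|
; Worked 8-bit example with divisor 2:
;   dividend -256: 256 >> 7 = 2, 256 & 7fh = 0 < 2   -> ok (quotient -128)
;   dividend -257: 257 >> 7 = 2, 257 & 7fh = 1 < 2   -> ok (quotient -128, remainder -1)
;   dividend -258: 258 >> 7 = 2, 258 & 7fh = 2 >= 2  -> divide error (quotient would be -129)
;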
2206%macro IEMIMPL_DIV_OP 6
2207BEGINCODE
2208BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2209 PROLOGUE_3_ARGS
2210
2211 ; div by chainsaw check.
2212 test A1_8, A1_8
2213 jz .div_zero
2214
2215        ; Overflow check - unsigned division is simple to verify, but we haven't
2216        ; found a simple way to check signed division, unfortunately.
2217 %if %4 == 0
2218 cmp [A0 + 1], A1_8
2219 jae .div_overflow
2220 %else
2221 mov T0_16, [A0] ; T0 = dividend
2222 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2223 test A1_8, A1_8
2224 js .divisor_negative
2225 test T0_16, T0_16
2226 jns .both_positive
2227 neg T0_16
2228.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2229 push T0 ; Start off like unsigned below.
2230 shr T0_16, 7
2231 cmp T0_8, A1_8
2232 pop T0
2233 jb .div_no_overflow
2234 ja .div_overflow
2235 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2236 cmp T0_8, A1_8
2237 jae .div_overflow
2238 jmp .div_no_overflow
2239
2240.divisor_negative:
2241 neg A1_8
2242 test T0_16, T0_16
2243 jns .one_of_each
2244 neg T0_16
2245.both_positive: ; Same as unsigned shifted by sign indicator bit.
2246 shr T0_16, 7
2247 cmp T0_8, A1_8
2248 jae .div_overflow
2249.div_no_overflow:
2250 mov A1, T1 ; restore divisor
2251 %endif
2252
2253 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2254 mov ax, [A0]
2255 %1 A1_8
2256 mov [A0], ax
2257 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2258 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2259 %else
2260 IEM_SAVE_FLAGS A2, %2, %3
2261 %endif
2262 xor eax, eax
2263
2264.return:
2265 EPILOGUE_3_ARGS
2266
2267.div_zero:
2268.div_overflow:
2269 mov eax, -1
2270 jmp .return
2271ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2272
2273BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2274 PROLOGUE_4_ARGS
2275
2276 ; div by chainsaw check.
2277 test A2_16, A2_16
2278 jz .div_zero
2279
2280        ; Overflow check - unsigned division is simple to verify, but we haven't
2281        ; found a simple way to check signed division, unfortunately.
2282 %if %4 == 0
2283 cmp [A1], A2_16
2284 jae .div_overflow
2285 %else
2286 mov T0_16, [A1]
2287 shl T0_32, 16
2288 mov T0_16, [A0] ; T0 = dividend
2289 mov T1, A2 ; T1 = divisor
2290 test T1_16, T1_16
2291 js .divisor_negative
2292 test T0_32, T0_32
2293 jns .both_positive
2294 neg T0_32
2295.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2296 push T0 ; Start off like unsigned below.
2297 shr T0_32, 15
2298 cmp T0_16, T1_16
2299 pop T0
2300 jb .div_no_overflow
2301 ja .div_overflow
2302 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2303 cmp T0_16, T1_16
2304 jae .div_overflow
2305 jmp .div_no_overflow
2306
2307.divisor_negative:
2308 neg T1_16
2309 test T0_32, T0_32
2310 jns .one_of_each
2311 neg T0_32
2312.both_positive: ; Same as unsigned shifted by sign indicator bit.
2313 shr T0_32, 15
2314 cmp T0_16, T1_16
2315 jae .div_overflow
2316.div_no_overflow:
2317 %endif
2318
2319 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2320 %ifdef ASM_CALL64_GCC
2321 mov T1, A2
2322 mov ax, [A0]
2323 mov dx, [A1]
2324 %1 T1_16
2325 mov [A0], ax
2326 mov [A1], dx
2327 %else
2328 mov T1, A1
2329 mov ax, [A0]
2330 mov dx, [T1]
2331 %1 A2_16
2332 mov [A0], ax
2333 mov [T1], dx
2334 %endif
2335 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2336 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2337 %else
2338 IEM_SAVE_FLAGS A3, %2, %3
2339 %endif
2340 xor eax, eax
2341
2342.return:
2343 EPILOGUE_4_ARGS
2344
2345.div_zero:
2346.div_overflow:
2347 mov eax, -1
2348 jmp .return
2349ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2350
2351BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2352 PROLOGUE_4_ARGS
2353
2354 ; div by chainsaw check.
2355 test A2_32, A2_32
2356 jz .div_zero
2357
2358        ; Overflow check - unsigned division is simple to verify, but we haven't
2359        ; found a simple way to check signed division, unfortunately.
2360 %if %4 == 0
2361 cmp [A1], A2_32
2362 jae .div_overflow
2363 %else
2364        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
2365 mov T0_32, [A0] ; T0 = dividend low
2366 mov T1_32, [A1] ; T1 = dividend high
2367 test A2_32, A2_32
2368 js .divisor_negative
2369 test T1_32, T1_32
2370 jns .both_positive
2371 call NAME(iemAImpl_negate_T0_T1_u32)
2372.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2373 push T0 ; Start off like unsigned below.
2374 shl T1_32, 1
2375 shr T0_32, 31
2376 or T1_32, T0_32
2377 cmp T1_32, A2_32
2378 pop T0
2379 jb .div_no_overflow
2380 ja .div_overflow
2381 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2382 cmp T0_32, A2_32
2383 jae .div_overflow
2384 jmp .div_no_overflow
2385
2386.divisor_negative:
2387 neg A2_32
2388 test T1_32, T1_32
2389 jns .one_of_each
2390 call NAME(iemAImpl_negate_T0_T1_u32)
2391.both_positive: ; Same as unsigned shifted by sign indicator bit.
2392 shl T1_32, 1
2393 shr T0_32, 31
2394 or T1_32, T0_32
2395 cmp T1_32, A2_32
2396 jae .div_overflow
2397.div_no_overflow:
2398 pop A2
2399 %endif
2400
2401 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2402 mov eax, [A0]
2403 %ifdef ASM_CALL64_GCC
2404 mov T1, A2
2405 mov eax, [A0]
2406 mov edx, [A1]
2407 %1 T1_32
2408 mov [A0], eax
2409 mov [A1], edx
2410 %else
2411 mov T1, A1
2412 mov eax, [A0]
2413 mov edx, [T1]
2414 %1 A2_32
2415 mov [A0], eax
2416 mov [T1], edx
2417 %endif
2418 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2419 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2420 %else
2421 IEM_SAVE_FLAGS A3, %2, %3
2422 %endif
2423 xor eax, eax
2424
2425.return:
2426 EPILOGUE_4_ARGS
2427
2428.div_overflow:
2429 %if %4 != 0
2430 pop A2
2431 %endif
2432.div_zero:
2433 mov eax, -1
2434 jmp .return
2435ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2436
2437 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2438BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2439 PROLOGUE_4_ARGS
2440
2441 test A2, A2
2442 jz .div_zero
2443 %if %4 == 0
2444 cmp [A1], A2
2445 jae .div_overflow
2446 %else
2447        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
2448 mov T0, [A0] ; T0 = dividend low
2449 mov T1, [A1] ; T1 = dividend high
2450 test A2, A2
2451 js .divisor_negative
2452 test T1, T1
2453 jns .both_positive
2454 call NAME(iemAImpl_negate_T0_T1_u64)
2455.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2456 push T0 ; Start off like unsigned below.
2457 shl T1, 1
2458 shr T0, 63
2459 or T1, T0
2460 cmp T1, A2
2461 pop T0
2462 jb .div_no_overflow
2463 ja .div_overflow
2464 mov T1, 0x7fffffffffffffff
2465 and T0, T1 ; Special case for covering (divisor - 1).
2466 cmp T0, A2
2467 jae .div_overflow
2468 jmp .div_no_overflow
2469
2470.divisor_negative:
2471 neg A2
2472 test T1, T1
2473 jns .one_of_each
2474 call NAME(iemAImpl_negate_T0_T1_u64)
2475.both_positive: ; Same as unsigned shifted by sign indicator bit.
2476 shl T1, 1
2477 shr T0, 63
2478 or T1, T0
2479 cmp T1, A2
2480 jae .div_overflow
2481.div_no_overflow:
2482 pop A2
2483 %endif
2484
2485 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2486 mov rax, [A0]
2487 %ifdef ASM_CALL64_GCC
2488 mov T1, A2
2489 mov rax, [A0]
2490 mov rdx, [A1]
2491 %1 T1
2492 mov [A0], rax
2493 mov [A1], rdx
2494 %else
2495 mov T1, A1
2496 mov rax, [A0]
2497 mov rdx, [T1]
2498 %1 A2
2499 mov [A0], rax
2500 mov [T1], rdx
2501 %endif
2502 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2503 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2504 %else
2505 IEM_SAVE_FLAGS A3, %2, %3
2506 %endif
2507 xor eax, eax
2508
2509.return:
2510 EPILOGUE_4_ARGS_EX 12
2511
2512.div_overflow:
2513 %if %4 != 0
2514 pop A2
2515 %endif
2516.div_zero:
2517 mov eax, -1
2518 jmp .return
2519ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2520 %endif ; RT_ARCH_AMD64
2521
2522%endmacro
2523
2524IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
2525IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
2526IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
2527IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
2528IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
2529IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2530
2531
2532;;
2533; Macro for implementing a memory fence operation.
2534;
2535; No return value, no operands or anything.
2536;
2537; @param 1 The instruction.
2538;
2539%macro IEMIMPL_MEM_FENCE 1
2540BEGINCODE
2541BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
2542 %1
2543 ret
2544ENDPROC iemAImpl_ %+ %1
2545%endmacro
2546
2547IEMIMPL_MEM_FENCE lfence
2548IEMIMPL_MEM_FENCE sfence
2549IEMIMPL_MEM_FENCE mfence
2550
2551;;
2552; Alternative for non-SSE2 host.
2553;
2554BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
2555 push xAX
2556 xchg xAX, [xSP]
2557 add xSP, xCB
2558 ret
2559ENDPROC iemAImpl_alt_mem_fence
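;
; Note: xchg with a memory operand is implicitly locked, so the read-modify-write of the
; stack slot above serves as a full memory barrier on hosts without SSE2/mfence.
;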
2560
2561
2562;;
2563; Initialize the FPU for the actual instruction being emulated; this means
2564; loading parts of the guest's control word and status word.
2565;
2566; @uses 24 bytes of stack. T0, T1
2567; @param 1 Expression giving the address of the FXSTATE of the guest.
2568;
2569%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
2570 fnstenv [xSP]
2571
2572 ; FCW - for exception, precision and rounding control.
2573 movzx T0, word [%1 + X86FXSTATE.FCW]
2574 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2575 mov [xSP + X86FSTENV32P.FCW], T0_16
2576
2577 ; FSW - for undefined C0, C1, C2, and C3.
2578 movzx T1, word [%1 + X86FXSTATE.FSW]
2579 and T1, X86_FSW_C_MASK
2580 movzx T0, word [xSP + X86FSTENV32P.FSW]
2581 and T0, X86_FSW_TOP_MASK
2582 or T0, T1
2583 mov [xSP + X86FSTENV32P.FSW], T0_16
2584
2585 fldenv [xSP]
2586%endmacro
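;
; Net effect of the macro above (bit layout: C0=bit 8, C1=bit 9, C2=bit 10, C3=bit 14,
; TOP=bits 11-13):
;       loaded FSW = (host FSW & X86_FSW_TOP_MASK) | (guest FSW & X86_FSW_C_MASK)
; i.e. the guest's condition code bits are carried over while the host's current TOP is
; kept, so the wrapper functions below keep their own register stack discipline.
;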
2587
2588
2589;;
2590; Initialize the FPU for the actual instruction being emulated; this means
2591; loading parts of the guest's control word and status word, and updating the
2592; tag word for the top register if it's empty.
2593;
2594; ASSUMES actual TOP=7
2595;
2596; @uses 24 bytes of stack. T0, T1
2597; @param 1 Expression giving the address of the FXSTATE of the guest.
2598;
2599%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
2600 fnstenv [xSP]
2601
2602 ; FCW - for exception, precision and rounding control.
2603 movzx T0_32, word [%1 + X86FXSTATE.FCW]
2604 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2605 mov [xSP + X86FSTENV32P.FCW], T0_16
2606
2607 ; FSW - for undefined C0, C1, C2, and C3.
2608 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2609 and T1_32, X86_FSW_C_MASK
2610 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
2611 and T0_32, X86_FSW_TOP_MASK
2612 or T0_32, T1_32
2613 mov [xSP + X86FSTENV32P.FSW], T0_16
2614
2615 ; FTW - Only for ST0 (in/out).
2616 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2617 shr T1_32, X86_FSW_TOP_SHIFT
2618 and T1_32, X86_FSW_TOP_SMASK
2619 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
2620 jc %%st0_not_empty
2621 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
2622%%st0_not_empty:
2623
2624 fldenv [xSP]
2625%endmacro
2626
2627
2628;;
2629; Need to move this as well somewhere better?
2630;
2631struc IEMFPURESULT
2632 .r80Result resw 5
2633 .FSW resw 1
2634endstruc
2635
2636
2637;;
2638; Need to move this as well somewhere better?
2639;
2640struc IEMFPURESULTTWO
2641 .r80Result1 resw 5
2642 .FSW resw 1
2643 .r80Result2 resw 5
2644endstruc
2645
2646
2647;
2648;---------------------- 16-bit signed integer operations ----------------------
2649;
2650
2651
2652;;
2653; Converts a 16-bit signed integer value to an 80-bit floating point one (fpu register).
2654;
2655; @param A0 FPU context (fxsave).
2656; @param A1 Pointer to a IEMFPURESULT for the output.
2657; @param A2 Pointer to the 16-bit signed integer value to convert.
2658;
2659BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
2660 PROLOGUE_3_ARGS
2661 sub xSP, 20h
2662
2663 fninit
2664 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2665 fild word [A2]
2666
2667 fnstsw word [A1 + IEMFPURESULT.FSW]
2668 fnclex
2669 fstp tword [A1 + IEMFPURESULT.r80Result]
2670
2671 fninit
2672 add xSP, 20h
2673 EPILOGUE_3_ARGS
2674ENDPROC iemAImpl_fild_r80_from_i16
2675
2676
2677;;
2678; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2679;
2680; @param A0 FPU context (fxsave).
2681; @param A1 Where to return the output FSW.
2682; @param A2 Where to store the 16-bit signed integer value.
2683; @param A3 Pointer to the 80-bit value.
2684;
2685BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
2686 PROLOGUE_4_ARGS
2687 sub xSP, 20h
2688
2689 fninit
2690 fld tword [A3]
2691 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2692 fistp word [A2]
2693
2694 fnstsw word [A1]
2695
2696 fninit
2697 add xSP, 20h
2698 EPILOGUE_4_ARGS
2699ENDPROC iemAImpl_fist_r80_to_i16
2700
2701
2702;;
2703; Store a 80-bit floating point value (register) as a 16-bit signed integer
2704; (memory) with truncation.
2705;
2706; @param A0 FPU context (fxsave).
2707; @param A1 Where to return the output FSW.
2708; @param A2 Where to store the 16-bit signed integer value.
2709; @param A3 Pointer to the 80-bit value.
2710;
2711BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
2712 PROLOGUE_4_ARGS
2713 sub xSP, 20h
2714
2715 fninit
2716 fld tword [A3]
2717 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2718 fisttp word [A2]
2719
2720 fnstsw word [A1]
2721
2722 fninit
2723 add xSP, 20h
2724 EPILOGUE_4_ARGS
2725ENDPROC iemAImpl_fistt_r80_to_i16
2726
2727
2728;;
2729; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2730;
2731; @param 1 The instruction
2732;
2733; @param A0 FPU context (fxsave).
2734; @param A1 Pointer to a IEMFPURESULT for the output.
2735; @param A2 Pointer to the 80-bit value.
2736; @param A3 Pointer to the 16-bit value.
2737;
2738%macro IEMIMPL_FPU_R80_BY_I16 1
2739BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2740 PROLOGUE_4_ARGS
2741 sub xSP, 20h
2742
2743 fninit
2744 fld tword [A2]
2745 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2746 %1 word [A3]
2747
2748 fnstsw word [A1 + IEMFPURESULT.FSW]
2749 fnclex
2750 fstp tword [A1 + IEMFPURESULT.r80Result]
2751
2752 fninit
2753 add xSP, 20h
2754 EPILOGUE_4_ARGS
2755ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2756%endmacro
2757
2758IEMIMPL_FPU_R80_BY_I16 fiadd
2759IEMIMPL_FPU_R80_BY_I16 fimul
2760IEMIMPL_FPU_R80_BY_I16 fisub
2761IEMIMPL_FPU_R80_BY_I16 fisubr
2762IEMIMPL_FPU_R80_BY_I16 fidiv
2763IEMIMPL_FPU_R80_BY_I16 fidivr
2764
2765
2766;;
2767; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2768; only returning FSW.
2769;
2770; @param 1 The instruction
2771;
2772; @param A0 FPU context (fxsave).
2773; @param A1 Where to store the output FSW.
2774; @param A2 Pointer to the 80-bit value.
2775; @param A3 Pointer to the 16-bit value.
2776;
2777%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
2778BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2779 PROLOGUE_4_ARGS
2780 sub xSP, 20h
2781
2782 fninit
2783 fld tword [A2]
2784 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2785 %1 word [A3]
2786
2787 fnstsw word [A1]
2788
2789 fninit
2790 add xSP, 20h
2791 EPILOGUE_4_ARGS
2792ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2793%endmacro
2794
2795IEMIMPL_FPU_R80_BY_I16_FSW ficom
2796
2797
2798
2799;
2800;---------------------- 32-bit signed integer operations ----------------------
2801;
2802
2803
2804;;
2805; Converts a 32-bit signed integer value to an 80-bit floating point one (fpu register).
2806;
2807; @param A0 FPU context (fxsave).
2808; @param A1 Pointer to a IEMFPURESULT for the output.
2809; @param A2 Pointer to the 32-bit signed integer value to convert.
2810;
2811BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
2812 PROLOGUE_3_ARGS
2813 sub xSP, 20h
2814
2815 fninit
2816 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2817 fild dword [A2]
2818
2819 fnstsw word [A1 + IEMFPURESULT.FSW]
2820 fnclex
2821 fstp tword [A1 + IEMFPURESULT.r80Result]
2822
2823 fninit
2824 add xSP, 20h
2825 EPILOGUE_3_ARGS
2826ENDPROC iemAImpl_fild_r80_from_i32
2827
2828
2829;;
2830; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2831;
2832; @param A0 FPU context (fxsave).
2833; @param A1 Where to return the output FSW.
2834; @param A2 Where to store the 32-bit signed integer value.
2835; @param A3 Pointer to the 80-bit value.
2836;
2837BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2838 PROLOGUE_4_ARGS
2839 sub xSP, 20h
2840
2841 fninit
2842 fld tword [A3]
2843 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2844 fistp dword [A2]
2845
2846 fnstsw word [A1]
2847
2848 fninit
2849 add xSP, 20h
2850 EPILOGUE_4_ARGS
2851ENDPROC iemAImpl_fist_r80_to_i32
2852
2853
2854;;
2855; Store a 80-bit floating point value (register) as a 32-bit signed integer
2856; (memory) with truncation.
2857;
2858; @param A0 FPU context (fxsave).
2859; @param A1 Where to return the output FSW.
2860; @param A2 Where to store the 32-bit signed integer value.
2861; @param A3 Pointer to the 80-bit value.
2862;
2863BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2864 PROLOGUE_4_ARGS
2865 sub xSP, 20h
2866
2867 fninit
2868 fld tword [A3]
2869 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2870 fisttp dword [A2]
2871
2872 fnstsw word [A1]
2873
2874 fninit
2875 add xSP, 20h
2876 EPILOGUE_4_ARGS
2877ENDPROC iemAImpl_fistt_r80_to_i32
2878
2879
2880;;
2881; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2882;
2883; @param 1 The instruction
2884;
2885; @param A0 FPU context (fxsave).
2886; @param A1 Pointer to a IEMFPURESULT for the output.
2887; @param A2 Pointer to the 80-bit value.
2888; @param A3 Pointer to the 32-bit value.
2889;
2890%macro IEMIMPL_FPU_R80_BY_I32 1
2891BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2892 PROLOGUE_4_ARGS
2893 sub xSP, 20h
2894
2895 fninit
2896 fld tword [A2]
2897 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2898 %1 dword [A3]
2899
2900 fnstsw word [A1 + IEMFPURESULT.FSW]
2901 fnclex
2902 fstp tword [A1 + IEMFPURESULT.r80Result]
2903
2904 fninit
2905 add xSP, 20h
2906 EPILOGUE_4_ARGS
2907ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2908%endmacro
2909
2910IEMIMPL_FPU_R80_BY_I32 fiadd
2911IEMIMPL_FPU_R80_BY_I32 fimul
2912IEMIMPL_FPU_R80_BY_I32 fisub
2913IEMIMPL_FPU_R80_BY_I32 fisubr
2914IEMIMPL_FPU_R80_BY_I32 fidiv
2915IEMIMPL_FPU_R80_BY_I32 fidivr
2916
2917
2918;;
2919; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2920; only returning FSW.
2921;
2922; @param 1 The instruction
2923;
2924; @param A0 FPU context (fxsave).
2925; @param A1 Where to store the output FSW.
2926; @param A2 Pointer to the 80-bit value.
2927; @param A3 Pointer to the 32-bit value.
2928;
2929%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2930BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2931 PROLOGUE_4_ARGS
2932 sub xSP, 20h
2933
2934 fninit
2935 fld tword [A2]
2936 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2937 %1 dword [A3]
2938
2939 fnstsw word [A1]
2940
2941 fninit
2942 add xSP, 20h
2943 EPILOGUE_4_ARGS
2944ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2945%endmacro
2946
2947IEMIMPL_FPU_R80_BY_I32_FSW ficom
2948
2949
2950
2951;
2952;---------------------- 64-bit signed integer operations ----------------------
2953;
2954
2955
2956;;
2957; Converts a 64-bit signed integer value to an 80-bit floating point one (fpu register).
2958;
2959; @param A0 FPU context (fxsave).
2960; @param A1 Pointer to a IEMFPURESULT for the output.
2961; @param A2 Pointer to the 64-bit signed integer value to convert.
2962;
2963BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
2964 PROLOGUE_3_ARGS
2965 sub xSP, 20h
2966
2967 fninit
2968 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2969 fild qword [A2]
2970
2971 fnstsw word [A1 + IEMFPURESULT.FSW]
2972 fnclex
2973 fstp tword [A1 + IEMFPURESULT.r80Result]
2974
2975 fninit
2976 add xSP, 20h
2977 EPILOGUE_3_ARGS
2978ENDPROC iemAImpl_fild_r80_from_i64
2979
2980
2981;;
2982; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2983;
2984; @param A0 FPU context (fxsave).
2985; @param A1 Where to return the output FSW.
2986; @param A2 Where to store the 64-bit signed integer value.
2987; @param A3 Pointer to the 80-bit value.
2988;
2989BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2990 PROLOGUE_4_ARGS
2991 sub xSP, 20h
2992
2993 fninit
2994 fld tword [A3]
2995 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2996 fistp qword [A2]
2997
2998 fnstsw word [A1]
2999
3000 fninit
3001 add xSP, 20h
3002 EPILOGUE_4_ARGS
3003ENDPROC iemAImpl_fist_r80_to_i64
3004
3005
3006;;
3007; Store a 80-bit floating point value (register) as a 64-bit signed integer
3008; (memory) with truncation.
3009;
3010; @param A0 FPU context (fxsave).
3011; @param A1 Where to return the output FSW.
3012; @param A2 Where to store the 64-bit signed integer value.
3013; @param A3 Pointer to the 80-bit value.
3014;
3015BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
3016 PROLOGUE_4_ARGS
3017 sub xSP, 20h
3018
3019 fninit
3020 fld tword [A3]
3021 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3022 fisttp qword [A2]
3023
3024 fnstsw word [A1]
3025
3026 fninit
3027 add xSP, 20h
3028 EPILOGUE_4_ARGS
3029ENDPROC iemAImpl_fistt_r80_to_i64
3030
3031
3032
3033;
3034;---------------------- 32-bit floating point operations ----------------------
3035;
3036
3037;;
3038; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3039;
3040; @param A0 FPU context (fxsave).
3041; @param A1 Pointer to a IEMFPURESULT for the output.
3042; @param A2 Pointer to the 32-bit floating point value to convert.
3043;
3044BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
3045 PROLOGUE_3_ARGS
3046 sub xSP, 20h
3047
3048 fninit
3049 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3050 fld dword [A2]
3051
3052 fnstsw word [A1 + IEMFPURESULT.FSW]
3053 fnclex
3054 fstp tword [A1 + IEMFPURESULT.r80Result]
3055
3056 fninit
3057 add xSP, 20h
3058 EPILOGUE_3_ARGS
3059ENDPROC iemAImpl_fld_r80_from_r32
3060
3061
3062;;
3063; Store a 80-bit floating point value (register) as a 32-bit one (memory).
3064;
3065; @param A0 FPU context (fxsave).
3066; @param A1 Where to return the output FSW.
3067; @param A2 Where to store the 32-bit value.
3068; @param A3 Pointer to the 80-bit value.
3069;
3070BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
3071 PROLOGUE_4_ARGS
3072 sub xSP, 20h
3073
3074 fninit
3075 fld tword [A3]
3076 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3077 fst dword [A2]
3078
3079 fnstsw word [A1]
3080
3081 fninit
3082 add xSP, 20h
3083 EPILOGUE_4_ARGS
3084ENDPROC iemAImpl_fst_r80_to_r32
3085
3086
3087;;
3088; FPU instruction working on one 80-bit and one 32-bit floating point value.
3089;
3090; @param 1 The instruction
3091;
3092; @param A0 FPU context (fxsave).
3093; @param A1 Pointer to a IEMFPURESULT for the output.
3094; @param A2 Pointer to the 80-bit value.
3095; @param A3 Pointer to the 32-bit value.
3096;
3097%macro IEMIMPL_FPU_R80_BY_R32 1
3098BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3099 PROLOGUE_4_ARGS
3100 sub xSP, 20h
3101
3102 fninit
3103 fld tword [A2]
3104 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3105 %1 dword [A3]
3106
3107 fnstsw word [A1 + IEMFPURESULT.FSW]
3108 fnclex
3109 fstp tword [A1 + IEMFPURESULT.r80Result]
3110
3111 fninit
3112 add xSP, 20h
3113 EPILOGUE_4_ARGS
3114ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3115%endmacro
3116
3117IEMIMPL_FPU_R80_BY_R32 fadd
3118IEMIMPL_FPU_R80_BY_R32 fmul
3119IEMIMPL_FPU_R80_BY_R32 fsub
3120IEMIMPL_FPU_R80_BY_R32 fsubr
3121IEMIMPL_FPU_R80_BY_R32 fdiv
3122IEMIMPL_FPU_R80_BY_R32 fdivr
3123
3124
3125;;
3126; FPU instruction working on one 80-bit and one 32-bit floating point value,
3127; only returning FSW.
3128;
3129; @param 1 The instruction
3130;
3131; @param A0 FPU context (fxsave).
3132; @param A1 Where to store the output FSW.
3133; @param A2 Pointer to the 80-bit value.
3134; @param A3 Pointer to the 32-bit value.
3135;
3136%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3137BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3138 PROLOGUE_4_ARGS
3139 sub xSP, 20h
3140
3141 fninit
3142 fld tword [A2]
3143 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3144 %1 dword [A3]
3145
3146 fnstsw word [A1]
3147
3148 fninit
3149 add xSP, 20h
3150 EPILOGUE_4_ARGS
3151ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3152%endmacro
3153
3154IEMIMPL_FPU_R80_BY_R32_FSW fcom
3155
3156
3157
3158;
3159;---------------------- 64-bit floating point operations ----------------------
3160;
3161
3162;;
3163; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3164;
3165; @param A0 FPU context (fxsave).
3166; @param A1 Pointer to a IEMFPURESULT for the output.
3167; @param A2 Pointer to the 64-bit floating point value to convert.
3168;
3169BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3170 PROLOGUE_3_ARGS
3171 sub xSP, 20h
3172
3173 fninit
3174 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3175 fld qword [A2]
3176
3177 fnstsw word [A1 + IEMFPURESULT.FSW]
3178 fnclex
3179 fstp tword [A1 + IEMFPURESULT.r80Result]
3180
3181 fninit
3182 add xSP, 20h
3183 EPILOGUE_3_ARGS
3184ENDPROC iemAImpl_fld_r80_from_r64
3185
3186
3187;;
3188; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3189;
3190; @param A0 FPU context (fxsave).
3191; @param A1 Where to return the output FSW.
3192; @param A2 Where to store the 64-bit value.
3193; @param A3 Pointer to the 80-bit value.
3194;
3195BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3196 PROLOGUE_4_ARGS
3197 sub xSP, 20h
3198
3199 fninit
3200 fld tword [A3]
3201 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3202 fst qword [A2]
3203
3204 fnstsw word [A1]
3205
3206 fninit
3207 add xSP, 20h
3208 EPILOGUE_4_ARGS
3209ENDPROC iemAImpl_fst_r80_to_r64
3210
3211
3212;;
3213; FPU instruction working on one 80-bit and one 64-bit floating point value.
3214;
3215; @param 1 The instruction
3216;
3217; @param A0 FPU context (fxsave).
3218; @param A1 Pointer to a IEMFPURESULT for the output.
3219; @param A2 Pointer to the 80-bit value.
3220; @param A3 Pointer to the 64-bit value.
3221;
3222%macro IEMIMPL_FPU_R80_BY_R64 1
3223BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3224 PROLOGUE_4_ARGS
3225 sub xSP, 20h
3226
3227 fninit
3228 fld tword [A2]
3229 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3230 %1 qword [A3]
3231
3232 fnstsw word [A1 + IEMFPURESULT.FSW]
3233 fnclex
3234 fstp tword [A1 + IEMFPURESULT.r80Result]
3235
3236 fninit
3237 add xSP, 20h
3238 EPILOGUE_4_ARGS
3239ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3240%endmacro
3241
3242IEMIMPL_FPU_R80_BY_R64 fadd
3243IEMIMPL_FPU_R80_BY_R64 fmul
3244IEMIMPL_FPU_R80_BY_R64 fsub
3245IEMIMPL_FPU_R80_BY_R64 fsubr
3246IEMIMPL_FPU_R80_BY_R64 fdiv
3247IEMIMPL_FPU_R80_BY_R64 fdivr
3248
3249;;
3250; FPU instruction working on one 80-bit and one 64-bit floating point value,
3251; only returning FSW.
3252;
3253; @param 1 The instruction
3254;
3255; @param A0 FPU context (fxsave).
3256; @param A1 Where to store the output FSW.
3257; @param A2 Pointer to the 80-bit value.
3258; @param A3 Pointer to the 64-bit value.
3259;
3260%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3261BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3262 PROLOGUE_4_ARGS
3263 sub xSP, 20h
3264
3265 fninit
3266 fld tword [A2]
3267 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3268 %1 qword [A3]
3269
3270 fnstsw word [A1]
3271
3272 fninit
3273 add xSP, 20h
3274 EPILOGUE_4_ARGS
3275ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3276%endmacro
3277
3278IEMIMPL_FPU_R80_BY_R64_FSW fcom
3279
3280
3281
3282;
3283;---------------------- 80-bit floating point operations ----------------------
3284;
3285
3286;;
3287; Loads a 80-bit floating point register value from memory.
3288;
3289; @param A0 FPU context (fxsave).
3290; @param A1 Pointer to a IEMFPURESULT for the output.
3291; @param A2 Pointer to the 80-bit floating point value to load.
3292;
3293BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3294 PROLOGUE_3_ARGS
3295 sub xSP, 20h
3296
3297 fninit
3298 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3299 fld tword [A2]
3300
3301 fnstsw word [A1 + IEMFPURESULT.FSW]
3302 fnclex
3303 fstp tword [A1 + IEMFPURESULT.r80Result]
3304
3305 fninit
3306 add xSP, 20h
3307 EPILOGUE_3_ARGS
3308ENDPROC iemAImpl_fld_r80_from_r80
3309
3310
3311;;
3312; Store a 80-bit floating point register to memory
3313;
3314; @param A0 FPU context (fxsave).
3315; @param A1 Where to return the output FSW.
3316; @param A2 Where to store the 80-bit value.
3317; @param A3 Pointer to the 80-bit register value.
3318;
3319BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3320 PROLOGUE_4_ARGS
3321 sub xSP, 20h
3322
3323 fninit
3324 fld tword [A3]
3325 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3326 fstp tword [A2]
3327
3328 fnstsw word [A1]
3329
3330 fninit
3331 add xSP, 20h
3332 EPILOGUE_4_ARGS
3333ENDPROC iemAImpl_fst_r80_to_r80
3334
3335
3336;;
3337; Loads an 80-bit floating point register value in BCD format from memory.
3338;
3339; @param A0 FPU context (fxsave).
3340; @param A1 Pointer to a IEMFPURESULT for the output.
3341; @param A2 Pointer to the 80-bit BCD value to load.
3342;
3343BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3344 PROLOGUE_3_ARGS
3345 sub xSP, 20h
3346
3347 fninit
3348 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3349 fbld tword [A2]
3350
3351 fnstsw word [A1 + IEMFPURESULT.FSW]
3352 fnclex
3353 fstp tword [A1 + IEMFPURESULT.r80Result]
3354
3355 fninit
3356 add xSP, 20h
3357 EPILOGUE_3_ARGS
3358ENDPROC iemAImpl_fld_r80_from_d80
3359
3360
3361;;
3362; Store a 80-bit floating point register to memory as BCD
3363;
3364; @param A0 FPU context (fxsave).
3365; @param A1 Where to return the output FSW.
3366; @param A2 Where to store the 80-bit BCD value.
3367; @param A3 Pointer to the 80-bit register value.
3368;
3369BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3370 PROLOGUE_4_ARGS
3371 sub xSP, 20h
3372
3373 fninit
3374 fld tword [A3]
3375 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3376 fbstp tword [A2]
3377
3378 fnstsw word [A1]
3379
3380 fninit
3381 add xSP, 20h
3382 EPILOGUE_4_ARGS
3383ENDPROC iemAImpl_fst_r80_to_d80
3384
3385
3386;;
3387; FPU instruction working on two 80-bit floating point values.
3388;
3389; @param 1 The instruction
; @param 2 The instruction operand(s) to emit after it, e.g. {st0, st1}, or {} for none.
3390;
3391; @param A0 FPU context (fxsave).
3392; @param A1 Pointer to a IEMFPURESULT for the output.
3393; @param A2 Pointer to the first 80-bit value (ST0)
3394; @param A3 Pointer to the second 80-bit value (STn).
3395;
3396%macro IEMIMPL_FPU_R80_BY_R80 2
3397BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3398 PROLOGUE_4_ARGS
3399 sub xSP, 20h
3400
3401 fninit
3402 fld tword [A3]
3403 fld tword [A2]
3404 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3405 %1 %2
3406
3407 fnstsw word [A1 + IEMFPURESULT.FSW]
3408 fnclex
3409 fstp tword [A1 + IEMFPURESULT.r80Result]
3410
3411 fninit
3412 add xSP, 20h
3413 EPILOGUE_4_ARGS
3414ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3415%endmacro
3416
3417IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3418IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3419IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3420IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3421IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3422IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3423IEMIMPL_FPU_R80_BY_R80 fprem, {}
3424IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3425IEMIMPL_FPU_R80_BY_R80 fscale, {}
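;
; Expansion examples for the two-argument form above (illustration only):
;   IEMIMPL_FPU_R80_BY_R80 fadd,  {st0, st1}  ->  '%1 %2' becomes 'fadd st0, st1'
;   IEMIMPL_FPU_R80_BY_R80 fprem, {}          ->  '%1 %2' becomes plain 'fprem' (implicitly st0/st1)
;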
3426
3427
3428;;
3429; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3430; storing the result in ST1 and popping the stack.
3431;
3432; @param 1 The instruction
3433;
3434; @param A0 FPU context (fxsave).
3435; @param A1 Pointer to a IEMFPURESULT for the output.
3436; @param A2 Pointer to the first 80-bit value (ST1).
3437; @param A3 Pointer to the second 80-bit value (ST0).
3438;
3439%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3440BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3441 PROLOGUE_4_ARGS
3442 sub xSP, 20h
3443
3444 fninit
3445 fld tword [A2]
3446 fld tword [A3]
3447 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3448 %1
3449
3450 fnstsw word [A1 + IEMFPURESULT.FSW]
3451 fnclex
3452 fstp tword [A1 + IEMFPURESULT.r80Result]
3453
3454 fninit
3455 add xSP, 20h
3456 EPILOGUE_4_ARGS
3457ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3458%endmacro
3459
3460IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3461IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3462IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3463
3464
3465;;
3466; FPU instruction working on two 80-bit floating point values, only
3467; returning FSW.
3468;
3469; @param 1 The instruction
3470;
3471; @param A0 FPU context (fxsave).
3472; @param A1 Pointer to a uint16_t for the resulting FSW.
3473; @param A2 Pointer to the first 80-bit value.
3474; @param A3 Pointer to the second 80-bit value.
3475;
3476%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3477BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3478 PROLOGUE_4_ARGS
3479 sub xSP, 20h
3480
3481 fninit
3482 fld tword [A3]
3483 fld tword [A2]
3484 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3485 %1 st0, st1
3486
3487 fnstsw word [A1]
3488
3489 fninit
3490 add xSP, 20h
3491 EPILOGUE_4_ARGS
3492ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3493%endmacro
3494
3495IEMIMPL_FPU_R80_BY_R80_FSW fcom
3496IEMIMPL_FPU_R80_BY_R80_FSW fucom
3497
3498
3499;;
3500; FPU instruction working on two 80-bit floating point values,
3501; returning FSW and EFLAGS (eax).
3502;
3503; @param 1 The instruction
3504;
3505; @returns EFLAGS in EAX.
3506; @param A0 FPU context (fxsave).
3507; @param A1 Pointer to a uint16_t for the resulting FSW.
3508; @param A2 Pointer to the first 80-bit value.
3509; @param A3 Pointer to the second 80-bit value.
3510;
3511%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
3512BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3513 PROLOGUE_4_ARGS
3514 sub xSP, 20h
3515
3516 fninit
3517 fld tword [A3]
3518 fld tword [A2]
3519 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3520 %1 st1
3521
3522 fnstsw word [A1]
3523 pushf
3524 pop xAX
3525
3526 fninit
3527 add xSP, 20h
3528 EPILOGUE_4_ARGS
3529ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3530%endmacro
3531
3532IEMIMPL_FPU_R80_BY_R80_EFL fcomi
3533IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3534
3535
3536;;
3537; FPU instruction working on one 80-bit floating point value.
3538;
3539; @param 1 The instruction
3540;
3541; @param A0 FPU context (fxsave).
3542; @param A1 Pointer to a IEMFPURESULT for the output.
3543; @param A2 Pointer to the 80-bit value.
3544;
3545%macro IEMIMPL_FPU_R80 1
3546BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3547 PROLOGUE_3_ARGS
3548 sub xSP, 20h
3549
3550 fninit
3551 fld tword [A2]
3552 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3553 %1
3554
3555 fnstsw word [A1 + IEMFPURESULT.FSW]
3556 fnclex
3557 fstp tword [A1 + IEMFPURESULT.r80Result]
3558
3559 fninit
3560 add xSP, 20h
3561 EPILOGUE_3_ARGS
3562ENDPROC iemAImpl_ %+ %1 %+ _r80
3563%endmacro
3564
3565IEMIMPL_FPU_R80 fchs
3566IEMIMPL_FPU_R80 fabs
3567IEMIMPL_FPU_R80 f2xm1
3568IEMIMPL_FPU_R80 fsqrt
3569IEMIMPL_FPU_R80 frndint
3570IEMIMPL_FPU_R80 fsin
3571IEMIMPL_FPU_R80 fcos
3572
3573
3574;;
3575; FPU instruction working on one 80-bit floating point value, only
3576; returning FSW.
3577;
3578; @param 1 The instruction
3579; @param 2 Non-zero to also restore FTW.
3580;
3581; @param A0 FPU context (fxsave).
3582; @param A1 Pointer to a uint16_t for the resulting FSW.
3583; @param A2 Pointer to the 80-bit value.
3584;
3585%macro IEMIMPL_FPU_R80_FSW 2
3586BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3587 PROLOGUE_3_ARGS
3588 sub xSP, 20h
3589
3590 fninit
3591 fld tword [A2]
3592%if %2 != 0
3593 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
3594%else
3595 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3596%endif
3597 %1
3598
3599 fnstsw word [A1]
3600
3601 fninit
3602 add xSP, 20h
3603 EPILOGUE_3_ARGS
3604ENDPROC iemAImpl_ %+ %1 %+ _r80
3605%endmacro
3606
3607IEMIMPL_FPU_R80_FSW ftst, 0
3608IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3609
3610
3611
3612;;
3613; FPU instruction loading a 80-bit floating point constant.
3614;
3615; @param 1 The instruction
3616;
3617; @param A0 FPU context (fxsave).
3618; @param A1 Pointer to a IEMFPURESULT for the output.
3619;
3620%macro IEMIMPL_FPU_R80_CONST 1
3621BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
3622 PROLOGUE_2_ARGS
3623 sub xSP, 20h
3624
3625 fninit
3626 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3627 %1
3628
3629 fnstsw word [A1 + IEMFPURESULT.FSW]
3630 fnclex
3631 fstp tword [A1 + IEMFPURESULT.r80Result]
3632
3633 fninit
3634 add xSP, 20h
3635 EPILOGUE_2_ARGS
3636ENDPROC iemAImpl_ %+ %1 %+
3637%endmacro
3638
3639IEMIMPL_FPU_R80_CONST fld1
3640IEMIMPL_FPU_R80_CONST fldl2t
3641IEMIMPL_FPU_R80_CONST fldl2e
3642IEMIMPL_FPU_R80_CONST fldpi
3643IEMIMPL_FPU_R80_CONST fldlg2
3644IEMIMPL_FPU_R80_CONST fldln2
3645IEMIMPL_FPU_R80_CONST fldz
3646
3647
3648;;
3649; FPU instruction working on one 80-bit floating point value, outputting two.
3650;
3651; @param 1 The instruction
3652;
3653; @param A0 FPU context (fxsave).
3654; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3655; @param A2 Pointer to the 80-bit value.
3656;
3657%macro IEMIMPL_FPU_R80_R80 1
3658BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
3659 PROLOGUE_3_ARGS
3660 sub xSP, 20h
3661
3662 fninit
3663 fld tword [A2]
3664 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3665 %1
3666
3667 fnstsw word [A1 + IEMFPURESULTTWO.FSW]
3668 fnclex
3669 fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
3670 fnclex
3671 fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
3672
3673 fninit
3674 add xSP, 20h
3675 EPILOGUE_3_ARGS
3676ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
3677%endmacro
3678
3679IEMIMPL_FPU_R80_R80 fptan
3680IEMIMPL_FPU_R80_R80 fxtract
3681IEMIMPL_FPU_R80_R80 fsincos
3682
3683
3684
3685
3686;---------------------- SSE and MMX Operations ----------------------
3687
3688;; @todo what do we need to do for MMX?
3689%macro IEMIMPL_MMX_PROLOGUE 0
3690%endmacro
3691%macro IEMIMPL_MMX_EPILOGUE 0
3692%endmacro
3693
3694;; @todo what do we need to do for SSE?
3695%macro IEMIMPL_SSE_PROLOGUE 0
3696%endmacro
3697%macro IEMIMPL_SSE_EPILOGUE 0
3698%endmacro
3699
3700;; @todo what do we need to do for AVX?
3701%macro IEMIMPL_AVX_PROLOGUE 0
3702%endmacro
3703%macro IEMIMPL_AVX_EPILOGUE 0
3704%endmacro
3705
3706
3707;;
3708; Media instruction working on two full sized registers.
3709;
3710; @param 1 The instruction
3711; @param 2 Whether there is an MMX variant (1) or not (0).
3712;
3713; @param A0 FPU context (fxsave).
3714; @param A1 Pointer to the first media register size operand (input/output).
3715; @param A2 Pointer to the second media register size operand (input).
3716;
3717%macro IEMIMPL_MEDIA_F2 2
3718%if %2 != 0
3719BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3720 PROLOGUE_3_ARGS
3721 IEMIMPL_MMX_PROLOGUE
3722
3723 movq mm0, [A1]
3724 movq mm1, [A2]
3725 %1 mm0, mm1
3726 movq [A1], mm0
3727
3728 IEMIMPL_MMX_EPILOGUE
3729 EPILOGUE_3_ARGS
3730ENDPROC iemAImpl_ %+ %1 %+ _u64
3731%endif
3732
3733BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3734 PROLOGUE_3_ARGS
3735 IEMIMPL_SSE_PROLOGUE
3736
3737 movdqu xmm0, [A1]
3738 movdqu xmm1, [A2]
3739 %1 xmm0, xmm1
3740 movdqu [A1], xmm0
3741
3742 IEMIMPL_SSE_EPILOGUE
3743 EPILOGUE_3_ARGS
3744ENDPROC iemAImpl_ %+ %1 %+ _u128
3745%endmacro
3746
3747IEMIMPL_MEDIA_F2 pshufb, 1
3748IEMIMPL_MEDIA_F2 pand, 1
3749IEMIMPL_MEDIA_F2 pandn, 1
3750IEMIMPL_MEDIA_F2 por, 1
3751IEMIMPL_MEDIA_F2 pxor, 1
3752IEMIMPL_MEDIA_F2 pcmpeqb, 1
3753IEMIMPL_MEDIA_F2 pcmpeqw, 1
3754IEMIMPL_MEDIA_F2 pcmpeqd, 1
3755IEMIMPL_MEDIA_F2 pcmpeqq, 0
3756IEMIMPL_MEDIA_F2 pcmpgtb, 1
3757IEMIMPL_MEDIA_F2 pcmpgtw, 1
3758IEMIMPL_MEDIA_F2 pcmpgtd, 1
3759IEMIMPL_MEDIA_F2 pcmpgtq, 0
3760IEMIMPL_MEDIA_F2 paddb, 1
3761IEMIMPL_MEDIA_F2 paddw, 1
3762IEMIMPL_MEDIA_F2 paddd, 1
3763IEMIMPL_MEDIA_F2 paddq, 1
3764IEMIMPL_MEDIA_F2 paddsb, 1
3765IEMIMPL_MEDIA_F2 paddsw, 1
3766IEMIMPL_MEDIA_F2 paddusb, 1
3767IEMIMPL_MEDIA_F2 paddusw, 1
3768IEMIMPL_MEDIA_F2 psubb, 1
3769IEMIMPL_MEDIA_F2 psubw, 1
3770IEMIMPL_MEDIA_F2 psubd, 1
3771IEMIMPL_MEDIA_F2 psubq, 1
3772IEMIMPL_MEDIA_F2 psubsb, 1
3773IEMIMPL_MEDIA_F2 psubsw, 1
3774IEMIMPL_MEDIA_F2 psubusb, 1
3775IEMIMPL_MEDIA_F2 psubusw, 1
3776IEMIMPL_MEDIA_F2 pmullw, 1
3777IEMIMPL_MEDIA_F2 pmulld, 0
3778IEMIMPL_MEDIA_F2 pmulhw, 1
3779IEMIMPL_MEDIA_F2 pmaddwd, 1
3780IEMIMPL_MEDIA_F2 pminub, 1
3781IEMIMPL_MEDIA_F2 pminuw, 0
3782IEMIMPL_MEDIA_F2 pminud, 0
3783IEMIMPL_MEDIA_F2 pminsb, 0
3784IEMIMPL_MEDIA_F2 pminsw, 1
3785IEMIMPL_MEDIA_F2 pminsd, 0
3786IEMIMPL_MEDIA_F2 pmaxub, 1
3787IEMIMPL_MEDIA_F2 pmaxuw, 0
3788IEMIMPL_MEDIA_F2 pmaxud, 0
3789IEMIMPL_MEDIA_F2 pmaxsb, 0
3790IEMIMPL_MEDIA_F2 pmaxsw, 1
3791IEMIMPL_MEDIA_F2 pmaxsd, 0
3792IEMIMPL_MEDIA_F2 pabsb, 1
3793IEMIMPL_MEDIA_F2 pabsw, 1
3794IEMIMPL_MEDIA_F2 pabsd, 1
3795IEMIMPL_MEDIA_F2 psignb, 1
3796IEMIMPL_MEDIA_F2 psignw, 1
3797IEMIMPL_MEDIA_F2 psignd, 1
3798IEMIMPL_MEDIA_F2 phaddw, 1
3799IEMIMPL_MEDIA_F2 phaddd, 1
3800IEMIMPL_MEDIA_F2 phsubw, 1
3801IEMIMPL_MEDIA_F2 phsubd, 1
3802IEMIMPL_MEDIA_F2 phaddsw, 1
3803IEMIMPL_MEDIA_F2 phsubsw, 1
3804IEMIMPL_MEDIA_F2 pmaddubsw, 1
3805IEMIMPL_MEDIA_F2 pmulhrsw, 1
3806IEMIMPL_MEDIA_F2 pmuludq, 1
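;
; A quick data example for one of the byte-wise helpers above (pcmpeqb, values arbitrary):
;   xmm0 = 11 22 33 44 ...,  xmm1 = 11 00 33 00 ...
;   pcmpeqb xmm0, xmm1  ->  xmm0 = FF 00 FF 00 ...   (FF where the bytes match, 00 elsewhere)
;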
3807
3808
3809;;
3810; Media instruction working on two full sized registers, but no FXSAVE state argument.
3811;
3812; @param 1 The instruction
3813; @param 2 Whether there is an MMX variant (1) or not (0).
3814;
3815; @param A0 Pointer to the first media register size operand (input/output).
3816; @param A1 Pointer to the second media register size operand (input).
3817;
3818%macro IEMIMPL_MEDIA_OPT_F2 2
3819%if %2 != 0
3820BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3821 PROLOGUE_2_ARGS
3822 IEMIMPL_MMX_PROLOGUE
3823
3824 movq mm0, [A0]
3825 movq mm1, [A1]
3826 %1 mm0, mm1
3827 movq [A0], mm0
3828
3829 IEMIMPL_MMX_EPILOGUE
3830 EPILOGUE_2_ARGS
3831ENDPROC iemAImpl_ %+ %1 %+ _u64
3832%endif
3833
3834BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3835 PROLOGUE_2_ARGS
3836 IEMIMPL_SSE_PROLOGUE
3837
3838 movdqu xmm0, [A0]
3839 movdqu xmm1, [A1]
3840 %1 xmm0, xmm1
3841 movdqu [A0], xmm0
3842
3843 IEMIMPL_SSE_EPILOGUE
3844 EPILOGUE_2_ARGS
3845ENDPROC iemAImpl_ %+ %1 %+ _u128
3846%endmacro
3847
3848IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3849IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3850IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3851IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3852IEMIMPL_MEDIA_OPT_F2 psllw, 1
3853IEMIMPL_MEDIA_OPT_F2 pslld, 1
3854IEMIMPL_MEDIA_OPT_F2 psllq, 1
3855IEMIMPL_MEDIA_OPT_F2 psrlw, 1
3856IEMIMPL_MEDIA_OPT_F2 psrld, 1
3857IEMIMPL_MEDIA_OPT_F2 psrlq, 1
3858IEMIMPL_MEDIA_OPT_F2 psraw, 1
3859IEMIMPL_MEDIA_OPT_F2 psrad, 1
3860IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
3861IEMIMPL_MEDIA_OPT_F2 pavgb, 1
3862IEMIMPL_MEDIA_OPT_F2 pavgw, 1
3863IEMIMPL_MEDIA_OPT_F2 psadbw, 1
3864IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
3865IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
3866IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
3867IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
3868IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3869IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3870IEMIMPL_MEDIA_OPT_F2 aesimc, 0
3871IEMIMPL_MEDIA_OPT_F2 aesenc, 0
3872IEMIMPL_MEDIA_OPT_F2 aesdec, 0
3873IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
3874IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3875IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
3876IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
3877IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
3878IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
3879IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3880
3881;;
3882; Media instruction working on one full sized and one half sized register (lower half).
3883;
3884; @param 1 The instruction
3885; @param 2 1 if MMX is included, 0 if not.
3886;
3887; @param A0 Pointer to the first full sized media register operand (input/output).
3888; @param A1 Pointer to the second half sized media register operand (input).
3889;
3890%macro IEMIMPL_MEDIA_F1L1 2
3891 %if %2 != 0
3892BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3893 PROLOGUE_2_ARGS
3894 IEMIMPL_MMX_PROLOGUE
3895
3896 movq mm0, [A0]
3897 movq mm1, [A1]
3898 %1 mm0, mm1
3899 movq [A0], mm0
3900
3901 IEMIMPL_MMX_EPILOGUE
3902 EPILOGUE_2_ARGS
3903ENDPROC iemAImpl_ %+ %1 %+ _u64
3904 %endif
3905
3906BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3907 PROLOGUE_2_ARGS
3908 IEMIMPL_SSE_PROLOGUE
3909
3910 movdqu xmm0, [A0]
3911 movdqu xmm1, [A1]
3912 %1 xmm0, xmm1
3913 movdqu [A0], xmm0
3914
3915 IEMIMPL_SSE_EPILOGUE
3916 EPILOGUE_2_ARGS
3917ENDPROC iemAImpl_ %+ %1 %+ _u128
3918%endmacro
3919
3920IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3921IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3922IEMIMPL_MEDIA_F1L1 punpckldq, 1
3923IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
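;
; Data flow illustration for the 64-bit punpcklbw helper (bytes shown high to low,
; values arbitrary):
;   mm0 = 07 06 05 04 03 02 01 00,  mm1 = 17 16 15 14 13 12 11 10
;   punpcklbw mm0, mm1  ->  mm0 = 13 03 12 02 11 01 10 00
;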
3924
3925
3926;;
3927; Media instruction working on two half sized input registers (lower half) and a full sized
3928; destination register (vpunpckl*).
3929;
3930; @param 1 The instruction
3931;
3932; @param A0 Pointer to the destination register (full sized, output only).
3933; @param A1 Pointer to the first full sized media source register operand, where we
3934; will only use the lower half as input - but we'll be loading it in full.
3935; @param A2 Pointer to the second full sized media source register operand, where we
3936; will only use the lower half as input - but we'll be loading it in full.
3937;
3938%macro IEMIMPL_MEDIA_F1L1L1 1
3939BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3940 PROLOGUE_3_ARGS
3941 IEMIMPL_AVX_PROLOGUE
3942
3943 vmovdqu xmm0, [A1]
3944 vmovdqu xmm1, [A2]
3945 %1 xmm0, xmm0, xmm1
3946 vmovdqu [A0], xmm0
3947
3948 IEMIMPL_AVX_EPILOGUE
3949 EPILOGUE_3_ARGS
3950ENDPROC iemAImpl_ %+ %1 %+ _u128
3951
3952BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
3953 PROLOGUE_3_ARGS
3954 IEMIMPL_AVX_PROLOGUE
3955
3956 vmovdqu ymm0, [A1]
3957 vmovdqu ymm1, [A2]
3958 %1 ymm0, ymm0, ymm1
3959 vmovdqu [A0], ymm0
3960
3961 IEMIMPL_AVX_EPILOGUE
3962 EPILOGUE_3_ARGS
3963ENDPROC iemAImpl_ %+ %1 %+ _u256
3964%endmacro
3965
3966IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
3967IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
3968IEMIMPL_MEDIA_F1L1L1 vpunpckldq
3969IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3970
3971
3972;;
3973; Media instruction working on one full sized and one half sized register (high half).
3974;
3975; @param 1 The instruction
3976; @param 2 1 if MMX is included, 0 if not.
3977;
3978; @param A0 Pointer to the first full sized media register operand (input/output).
3979; @param A1 Pointer to the second full sized media register operand, where we
3980; will only use the upper half as input - but we'll load it in full.
3981;
3982%macro IEMIMPL_MEDIA_F1H1 2
3983IEMIMPL_MEDIA_F1L1 %1, %2
3984%endmacro
3985
3986IEMIMPL_MEDIA_F1L1 punpckhbw, 1
3987IEMIMPL_MEDIA_F1L1 punpckhwd, 1
3988IEMIMPL_MEDIA_F1L1 punpckhdq, 1
3989IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
3990
3991
3992;;
3993; Media instruction working on two half sized input registers (high half) and a full sized
3994; destination register (vpunpckh*).
3995;
3996; @param 1 The instruction
3997;
3998; @param A0 Pointer to the destination register (full sized, output only).
3999; @param A1 Pointer to the first full sized media source register operand, where we
4000; will only use the upper half as input - but we'll be loading it in full.
4001; @param A2 Pointer to the second full sized media source register operand, where we
4002; will only use the upper half as input - but we'll be loading it in full.
4003;
4004%macro IEMIMPL_MEDIA_F1H1H1 1
4005IEMIMPL_MEDIA_F1L1L1 %1
4006%endmacro
4007
4008IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
4009IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
4010IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
4011IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4012
4013
4014;
4015; Shufflers with evil 8-bit immediates.
4016;
4017
4018BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
4019 PROLOGUE_3_ARGS
4020 IEMIMPL_MMX_PROLOGUE
4021
4022 movq mm1, [A1]
4023        movq    mm0, mm1                ; paranoia!
4024 lea T1, [.imm0 xWrtRIP]
4025 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4026 lea T0, [A2 + A2*8] ; sizeof(pshufw+ret) == 9
4027 %else
4028 lea T0, [A2 + A2*4] ; sizeof(pshufw+ret) == 5
4029 %endif
4030 lea T1, [T1 + T0]
4031 IBT_NOTRACK
4032 call T1
4033 movq [A0], mm0
4034
4035 IEMIMPL_MMX_EPILOGUE
4036 EPILOGUE_3_ARGS
4037%assign bImm 0
4038%rep 256
4039.imm %+ bImm:
4040 IBT_ENDBRxx_WITHOUT_NOTRACK
4041 pshufw mm0, mm1, bImm
4042 ret
4043 %assign bImm bImm + 1
4044%endrep
4045.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
4046ENDPROC iemAImpl_pshufw_u64
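;
; Offset calculation example for the immediate dispatch table above (the byte in A2 is
; the immediate): with A2 = 32, each stub is pshufw (4 bytes) + ret (1 byte) = 5 bytes
; without IBT, or endbr + pshufw + ret = 9 bytes with IBT, so 'call T1' lands on
; '.imm32: pshufw mm0, mm1, 32' followed by 'ret'.
;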
4047
4048
4049%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
4050BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4051 PROLOGUE_3_ARGS
4052 IEMIMPL_SSE_PROLOGUE
4053
4054 movdqu xmm1, [A1]
4055 movdqu xmm0, xmm1 ; paranoia!
4056 lea T1, [.imm0 xWrtRIP]
4057 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4058 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
4059 %else
4060 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4061 %endif
4062 lea T1, [T1 + T0*2]
4063 IBT_NOTRACK
4064 call T1
4065 movdqu [A0], xmm0
4066
4067 IEMIMPL_SSE_EPILOGUE
4068 EPILOGUE_3_ARGS
4069
4070 %assign bImm 0
4071 %rep 256
4072.imm %+ bImm:
4073 IBT_ENDBRxx_WITHOUT_NOTRACK
4074 %1 xmm0, xmm1, bImm
4075 ret
4076 %assign bImm bImm + 1
4077 %endrep
4078.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4079ENDPROC iemAImpl_ %+ %1 %+ _u128
4080%endmacro
4081
4082IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
4083IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
4084IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4085
4086
4087%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
4088BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4089 PROLOGUE_3_ARGS
4090 IEMIMPL_SSE_PROLOGUE
4091
4092 vmovdqu ymm1, [A1]
4093 vmovdqu ymm0, ymm1 ; paranoia!
4094 lea T1, [.imm0 xWrtRIP]
4095 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4096 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
4097 %else
4098 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4099 %endif
4100 lea T1, [T1 + T0*2]
4101 IBT_NOTRACK
4102 call T1
4103 vmovdqu [A0], ymm0
4104
4105 IEMIMPL_SSE_EPILOGUE
4106 EPILOGUE_3_ARGS
4107 %assign bImm 0
4108 %rep 256
4109.imm %+ bImm:
4110 IBT_ENDBRxx_WITHOUT_NOTRACK
4111 %1 ymm0, ymm1, bImm
4112 ret
4113 %assign bImm bImm + 1
4114 %endrep
4115.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4116ENDPROC iemAImpl_ %+ %1 %+ _u256
4117%endmacro
4118
4119IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
4120IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
4121IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4122
4123
4124;
4125; Shifts with evil 8-bit immediates.
4126;
4127
4128%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
4129BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
4130 PROLOGUE_2_ARGS
4131 IEMIMPL_MMX_PROLOGUE
4132
4133 movq mm0, [A0]
4134 lea T1, [.imm0 xWrtRIP]
4135 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4136 lea T0, [A1 + A1*8] ; sizeof(endbrxx+psXX+ret) == 9
4137 %else
4138 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 5
4139 %endif
4140 lea T1, [T1 + T0]
4141 IBT_NOTRACK
4142 call T1
4143 movq [A0], mm0
4144
4145 IEMIMPL_MMX_EPILOGUE
4146 EPILOGUE_2_ARGS
4147%assign bImm 0
4148%rep 256
4149.imm %+ bImm:
4150 IBT_ENDBRxx_WITHOUT_NOTRACK
4151 %1 mm0, bImm
4152 ret
4153 %assign bImm bImm + 1
4154%endrep
4155.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
4156ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
4157%endmacro
4158
4159IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4160IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4161IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4162IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4163IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4164IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4165IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4166IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4167
4168
4169%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
4170BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
4171 PROLOGUE_2_ARGS
4172 IEMIMPL_SSE_PROLOGUE
4173
4174 movdqu xmm0, [A0]
4175 lea T1, [.imm0 xWrtRIP]
4176 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4177 lea T0, [A1 + A1*4] ; sizeof(endbrxx+psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
4178 %else
4179 lea T0, [A1 + A1*2] ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
4180 %endif
4181 lea T1, [T1 + T0*2]
4182 IBT_NOTRACK
4183 call T1
4184 movdqu [A0], xmm0
4185
4186 IEMIMPL_SSE_EPILOGUE
4187 EPILOGUE_2_ARGS
4188 %assign bImm 0
4189 %rep 256
4190.imm %+ bImm:
4191 IBT_ENDBRxx_WITHOUT_NOTRACK
4192 %1 xmm0, bImm
4193 ret
4194 %assign bImm bImm + 1
4195 %endrep
4196.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4197ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
4198%endmacro
4199
4200IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
4201IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
4202IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
4203IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
4204IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
4205IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
4206IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
4207IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
4208IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
4209IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4210
4211
4212;
4213; Move byte mask.
4214;
4215
4216BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
4217 PROLOGUE_2_ARGS
4218 IEMIMPL_MMX_PROLOGUE
4219
4220 movq mm1, [A1]
4221 pmovmskb T0, mm1
4222 mov [A0], T0
4223%ifdef RT_ARCH_X86
4224 mov dword [A0 + 4], 0
4225%endif
4226 IEMIMPL_MMX_EPILOGUE
4227 EPILOGUE_2_ARGS
4228ENDPROC iemAImpl_pmovmskb_u64
4229
4230BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
4231 PROLOGUE_2_ARGS
4232 IEMIMPL_SSE_PROLOGUE
4233
4234 movdqu xmm1, [A1]
4235 pmovmskb T0, xmm1
4236 mov [A0], T0
4237%ifdef RT_ARCH_X86
4238 mov dword [A0 + 4], 0
4239%endif
4240 IEMIMPL_SSE_EPILOGUE
4241 EPILOGUE_2_ARGS
4242ENDPROC iemAImpl_pmovmskb_u128
4243
4244BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
4245 PROLOGUE_2_ARGS
4246 IEMIMPL_AVX_PROLOGUE
4247
4248 vmovdqu ymm1, [A1]
4249 vpmovmskb T0, ymm1
4250 mov [A0], T0
4251%ifdef RT_ARCH_X86
4252 mov dword [A0 + 4], 0
4253%endif
4254 IEMIMPL_AVX_EPILOGUE
4255 EPILOGUE_2_ARGS
4256ENDPROC iemAImpl_vpmovmskb_u256
4257
4258
4259;;
4260; Media instruction working on two full sized source registers and one destination (AVX).
4261;
4262; @param 1 The instruction
4263;
4264; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4265; @param A1 Pointer to the destination media register size operand (output).
4266; @param A2 Pointer to the first source media register size operand (input).
4267; @param A3 Pointer to the second source media register size operand (input).
4268;
4269%macro IEMIMPL_MEDIA_F3 1
4270BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4271 PROLOGUE_4_ARGS
4272 IEMIMPL_AVX_PROLOGUE
4273
4274 vmovdqu xmm0, [A2]
4275 vmovdqu xmm1, [A3]
4276 %1 xmm0, xmm0, xmm1
4277 vmovdqu [A1], xmm0
4278
4279 IEMIMPL_AVX_EPILOGUE
4280 EPILOGUE_4_ARGS
4281ENDPROC iemAImpl_ %+ %1 %+ _u128
4282
4283BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4284 PROLOGUE_4_ARGS
4285 IEMIMPL_AVX_PROLOGUE
4286
4287 vmovdqu ymm0, [A2]
4288 vmovdqu ymm1, [A3]
4289 %1 ymm0, ymm0, ymm1
4290 vmovdqu [A1], ymm0
4291
4292 IEMIMPL_AVX_EPILOGUE
4293 EPILOGUE_4_ARGS
4294ENDPROC iemAImpl_ %+ %1 %+ _u256
4295%endmacro
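;
; For reference, IEMIMPL_MEDIA_F3 vpxor expands roughly to the following for
; the 128-bit case (A1 = destination, A2/A3 = sources; the 256-bit worker is
; the same with ymm registers):
;
;   BEGINPROC_FASTCALL iemAImpl_vpxor_u128, 16
;           PROLOGUE_4_ARGS
;           IEMIMPL_AVX_PROLOGUE
;           vmovdqu xmm0, [A2]
;           vmovdqu xmm1, [A3]
;           vpxor   xmm0, xmm0, xmm1
;           vmovdqu [A1], xmm0
;           IEMIMPL_AVX_EPILOGUE
;           EPILOGUE_4_ARGS
;   ENDPROC iemAImpl_vpxor_u128
;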
4296
4297IEMIMPL_MEDIA_F3 vpshufb
4298IEMIMPL_MEDIA_F3 vpand
4299IEMIMPL_MEDIA_F3 vpminub
4300IEMIMPL_MEDIA_F3 vpminuw
4301IEMIMPL_MEDIA_F3 vpminud
4302IEMIMPL_MEDIA_F3 vpminsb
4303IEMIMPL_MEDIA_F3 vpminsw
4304IEMIMPL_MEDIA_F3 vpminsd
4305IEMIMPL_MEDIA_F3 vpmaxub
4306IEMIMPL_MEDIA_F3 vpmaxuw
4307IEMIMPL_MEDIA_F3 vpmaxud
4308IEMIMPL_MEDIA_F3 vpmaxsb
4309IEMIMPL_MEDIA_F3 vpmaxsw
4310IEMIMPL_MEDIA_F3 vpmaxsd
4311IEMIMPL_MEDIA_F3 vpandn
4312IEMIMPL_MEDIA_F3 vpor
4313IEMIMPL_MEDIA_F3 vpxor
4314IEMIMPL_MEDIA_F3 vpcmpeqb
4315IEMIMPL_MEDIA_F3 vpcmpeqw
4316IEMIMPL_MEDIA_F3 vpcmpeqd
4317IEMIMPL_MEDIA_F3 vpcmpeqq
4318IEMIMPL_MEDIA_F3 vpcmpgtb
4319IEMIMPL_MEDIA_F3 vpcmpgtw
4320IEMIMPL_MEDIA_F3 vpcmpgtd
4321IEMIMPL_MEDIA_F3 vpcmpgtq
4322IEMIMPL_MEDIA_F3 vpaddb
4323IEMIMPL_MEDIA_F3 vpaddw
4324IEMIMPL_MEDIA_F3 vpaddd
4325IEMIMPL_MEDIA_F3 vpaddq
4326IEMIMPL_MEDIA_F3 vpsubb
4327IEMIMPL_MEDIA_F3 vpsubw
4328IEMIMPL_MEDIA_F3 vpsubd
4329IEMIMPL_MEDIA_F3 vpsubq
4330
4331
4332;;
4333; Media instruction working on two full sized source registers and one destination (AVX),
4334; but no XSAVE state pointer argument.
4335;
4336; @param 1 The instruction
4337;
4338; @param A0 Pointer to the destination media register size operand (output).
4339; @param A1 Pointer to the first source media register size operand (input).
4340; @param A2 Pointer to the second source media register size operand (input).
4341;
4342%macro IEMIMPL_MEDIA_OPT_F3 1
4343BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4344 PROLOGUE_3_ARGS
4345 IEMIMPL_AVX_PROLOGUE
4346
4347 vmovdqu xmm0, [A1]
4348 vmovdqu xmm1, [A2]
4349 %1 xmm0, xmm0, xmm1
4350 vmovdqu [A0], xmm0
4351
4352 IEMIMPL_AVX_EPILOGUE
4353 EPILOGUE_3_ARGS
4354ENDPROC iemAImpl_ %+ %1 %+ _u128
4355
4356BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4357 PROLOGUE_3_ARGS
4358 IEMIMPL_AVX_PROLOGUE
4359
4360 vmovdqu ymm0, [A1]
4361 vmovdqu ymm1, [A2]
4362 %1 ymm0, ymm0, ymm1
4363 vmovdqu [A0], ymm0
4364
4365 IEMIMPL_AVX_EPILOGUE
4366 EPILOGUE_3_ARGS
4367ENDPROC iemAImpl_ %+ %1 %+ _u256
4368%endmacro
4369
4370IEMIMPL_MEDIA_OPT_F3 vpacksswb
4371IEMIMPL_MEDIA_OPT_F3 vpackssdw
4372IEMIMPL_MEDIA_OPT_F3 vpackuswb
4373IEMIMPL_MEDIA_OPT_F3 vpackusdw
4374IEMIMPL_MEDIA_OPT_F3 vpmullw
4375IEMIMPL_MEDIA_OPT_F3 vpmulld
4376IEMIMPL_MEDIA_OPT_F3 vpmulhw
4377IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4378IEMIMPL_MEDIA_OPT_F3 vpavgb
4379IEMIMPL_MEDIA_OPT_F3 vpavgw
4380IEMIMPL_MEDIA_OPT_F3 vpsignb
4381IEMIMPL_MEDIA_OPT_F3 vpsignw
4382IEMIMPL_MEDIA_OPT_F3 vpsignd
4383IEMIMPL_MEDIA_OPT_F3 vphaddw
4384IEMIMPL_MEDIA_OPT_F3 vphaddd
4385IEMIMPL_MEDIA_OPT_F3 vphsubw
4386IEMIMPL_MEDIA_OPT_F3 vphsubd
4387IEMIMPL_MEDIA_OPT_F3 vphaddsw
4388IEMIMPL_MEDIA_OPT_F3 vphsubsw
4389IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4390IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4391IEMIMPL_MEDIA_OPT_F3 vpsadbw
4392IEMIMPL_MEDIA_OPT_F3 vpmuldq
4393IEMIMPL_MEDIA_OPT_F3 vpmuludq
4394IEMIMPL_MEDIA_OPT_F3 vunpcklps
4395IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4396IEMIMPL_MEDIA_OPT_F3 vunpckhps
4397IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4398IEMIMPL_MEDIA_OPT_F3 vpsubsb
4399IEMIMPL_MEDIA_OPT_F3 vpsubsw
4400IEMIMPL_MEDIA_OPT_F3 vpsubusb
4401IEMIMPL_MEDIA_OPT_F3 vpsubusw
4402IEMIMPL_MEDIA_OPT_F3 vpaddusb
4403IEMIMPL_MEDIA_OPT_F3 vpaddusw
4404IEMIMPL_MEDIA_OPT_F3 vpaddsb
4405IEMIMPL_MEDIA_OPT_F3 vpaddsw
4406
4407
4408;;
4409; Media instruction working on one full sized source register and one destination (AVX),
4410; but no XSAVE state pointer argument.
4411;
4412; @param 1 The instruction
4413; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
4414;
4415; @param A0 Pointer to the destination media register size operand (output).
4416; @param A1 Pointer to the source media register size operand (input).
4417;
4418%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
4419BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4420 PROLOGUE_2_ARGS
4421 IEMIMPL_AVX_PROLOGUE
4422
4423 vmovdqu xmm0, [A1]
4424 %1 xmm0, xmm0
4425 vmovdqu [A0], xmm0
4426
4427 IEMIMPL_AVX_EPILOGUE
4428 EPILOGUE_2_ARGS
4429ENDPROC iemAImpl_ %+ %1 %+ _u128
4430
4431 %if %2 == 1
4432BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4433 PROLOGUE_2_ARGS
4434 IEMIMPL_AVX_PROLOGUE
4435
4436 vmovdqu ymm0, [A1]
4437 %1 ymm0, ymm0
4438 vmovdqu [A0], ymm0
4439
4440 IEMIMPL_AVX_EPILOGUE
4441 EPILOGUE_2_ARGS
4442ENDPROC iemAImpl_ %+ %1 %+ _u256
4443 %endif
4444%endmacro
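;
; The second parameter selects whether a 256-bit worker is emitted as well;
; e.g. IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1 produces both iemAImpl_vpabsb_u128
; and iemAImpl_vpabsb_u256, whereas vphminposuw (flag 0) only gets the
; 128-bit worker as the instruction has no 256-bit form.
;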
4445
4446IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4447IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4448IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4449IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4450
4451
4452;
4453; The SSE 4.2 crc32
4454;
4455; @param A1 Pointer to the 32-bit destination.
4456; @param A2 The source operand, sized according to the suffix.
4457;
4458BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
4459 PROLOGUE_2_ARGS
4460
4461 mov T0_32, [A0]
4462 crc32 T0_32, A1_8
4463 mov [A0], T0_32
4464
4465 EPILOGUE_2_ARGS
4466ENDPROC iemAImpl_crc32_u8
4467
4468BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
4469 PROLOGUE_2_ARGS
4470
4471 mov T0_32, [A0]
4472 crc32 T0_32, A1_16
4473 mov [A0], T0_32
4474
4475 EPILOGUE_2_ARGS
4476ENDPROC iemAImpl_crc32_u16
4477
4478BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
4479 PROLOGUE_2_ARGS
4480
4481 mov T0_32, [A0]
4482 crc32 T0_32, A1_32
4483 mov [A0], T0_32
4484
4485 EPILOGUE_2_ARGS
4486ENDPROC iemAImpl_crc32_u32
4487
4488%ifdef RT_ARCH_AMD64
4489BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
4490 PROLOGUE_2_ARGS
4491
4492 mov T0_32, [A0]
4493 crc32 T0, A1
4494 mov [A0], T0_32
4495
4496 EPILOGUE_2_ARGS
4497ENDPROC iemAImpl_crc32_u64
4498%endif
4499
4500
4501;
4502; PTEST (SSE 4.1)
4503;
4504; @param A0 Pointer to the first source operand (aka readonly destination).
4505; @param A1 Pointer to the second source operand.
4506; @param A2 Pointer to the EFLAGS register.
4507;
4508BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
4509 PROLOGUE_3_ARGS
4510 IEMIMPL_SSE_PROLOGUE
4511
4512 movdqu xmm0, [A0]
4513 movdqu xmm1, [A1]
4514 ptest xmm0, xmm1
4515 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0
4516
4517 IEMIMPL_SSE_EPILOGUE
4518 EPILOGUE_3_ARGS
4519ENDPROC iemAImpl_ptest_u128
4520
4521BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
4522 PROLOGUE_3_ARGS
4523 IEMIMPL_SSE_PROLOGUE
4524
4525 vmovdqu ymm0, [A0]
4526 vmovdqu ymm1, [A1]
4527 vptest ymm0, ymm1
4528 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0
4529
4530 IEMIMPL_SSE_EPILOGUE
4531 EPILOGUE_3_ARGS
4532ENDPROC iemAImpl_vptest_u256
4533
4534
4535;;
4536; Template for the [v]pmov{s,z}x* instructions
4537;
4538; @param 1 The instruction
4539;
4540; @param A0 Pointer to the destination media register size operand (output).
4541; @param A1 The source operand value (input).
4542;
4543%macro IEMIMPL_V_PMOV_SZ_X 1
4544BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4545 PROLOGUE_2_ARGS
4546 IEMIMPL_SSE_PROLOGUE
4547
4548 movd xmm0, A1
4549 %1 xmm0, xmm0
4550 vmovdqu [A0], xmm0
4551
4552 IEMIMPL_SSE_EPILOGUE
4553 EPILOGUE_2_ARGS
4554ENDPROC iemAImpl_ %+ %1 %+ _u128
4555
4556BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4557 PROLOGUE_2_ARGS
4558 IEMIMPL_AVX_PROLOGUE
4559
4560 movd xmm0, A1
4561 v %+ %1 xmm0, xmm0
4562 vmovdqu [A0], xmm0
4563
4564 IEMIMPL_AVX_EPILOGUE
4565 EPILOGUE_2_ARGS
4566ENDPROC iemAImpl_v %+ %1 %+ _u128
4567
4568BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4569 PROLOGUE_2_ARGS
4570 IEMIMPL_AVX_PROLOGUE
4571
4572 movdqu xmm0, [A1]
4573 v %+ %1 ymm0, xmm0
4574 vmovdqu [A0], ymm0
4575
4576 IEMIMPL_AVX_EPILOGUE
4577 EPILOGUE_2_ARGS
4578ENDPROC iemAImpl_v %+ %1 %+ _u256
4579%endmacro
4580
4581IEMIMPL_V_PMOV_SZ_X pmovsxbw
4582IEMIMPL_V_PMOV_SZ_X pmovsxbd
4583IEMIMPL_V_PMOV_SZ_X pmovsxbq
4584IEMIMPL_V_PMOV_SZ_X pmovsxwd
4585IEMIMPL_V_PMOV_SZ_X pmovsxwq
4586IEMIMPL_V_PMOV_SZ_X pmovsxdq
4587
4588IEMIMPL_V_PMOV_SZ_X pmovzxbw
4589IEMIMPL_V_PMOV_SZ_X pmovzxbd
4590IEMIMPL_V_PMOV_SZ_X pmovzxbq
4591IEMIMPL_V_PMOV_SZ_X pmovzxwd
4592IEMIMPL_V_PMOV_SZ_X pmovzxwq
4593IEMIMPL_V_PMOV_SZ_X pmovzxdq
4594
4595
4596;;
4597; Need to move this somewhere better as well?
4598;
4599struc IEMSSERESULT
4600 .uResult resd 4
4601 .MXCSR resd 1
4602endstruc
4603
4604
4605;;
4606; Need to move this somewhere better as well?
4607;
4608struc IEMAVX128RESULT
4609 .uResult resd 4
4610 .MXCSR resd 1
4611endstruc
4612
4613
4614;;
4615; Need to move this somewhere better as well?
4616;
4617struc IEMAVX256RESULT
4618 .uResult resd 8
4619 .MXCSR resd 1
4620endstruc
4621
4622
4623;;
4624; Initialize the SSE MXCSR register using selected parts of the guest value,
4625; mainly to account for the rounding mode (and FZ/DAZ).
4626;
4627; @uses 4 bytes of stack to save the original value, T0.
4628; @param 1 Expression giving the address of the FXSTATE of the guest.
4629;
4630%macro SSE_LD_FXSTATE_MXCSR 1
4631 sub xSP, 4
4632
4633 stmxcsr [xSP]
4634 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4635 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
4636 or T0_32, X86_MXCSR_XCPT_MASK
4637 sub xSP, 4
4638 mov [xSP], T0_32
4639 ldmxcsr [xSP]
4640 add xSP, 4
4641%endmacro
4642
4643
4644;;
4645; Restores the SSE MXCSR register with the original value.
4646;
4647; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4648; @param 1 Expression giving the address where to return the MXCSR value.
4649; @param 2 Expression giving the address of the FXSTATE of the guest.
4650;
4651; @note Restores the stack pointer.
4652;
4653%macro SSE_ST_FXSTATE_MXCSR 2
4654 sub xSP, 4
4655 stmxcsr [xSP]
4656 mov T0_32, [xSP]
4657 add xSP, 4
4658 ; Merge the status bits into the original MXCSR value.
4659 mov T1_32, [%2 + X86FXSTATE.MXCSR]
4660 and T0_32, X86_MXCSR_XCPT_FLAGS
4661 or T0_32, T1_32
4662 mov [%1 + IEMSSERESULT.MXCSR], T0_32
4663
4664 ldmxcsr [xSP]
4665 add xSP, 4
4666%endmacro
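;
; Taken together, the two macros above let the instruction execute under the
; guest's rounding / FZ / DAZ settings with all SIMD exceptions masked on the
; host, and afterwards merge the sticky exception flags raised during the
; instruction back into the guest MXCSR image.  The value stored is roughly:
;
;       IEMSSERESULT.MXCSR = (host MXCSR & X86_MXCSR_XCPT_FLAGS) | guest MXCSR
;
; i.e. only the exception status bits come from the host; everything else is
; the unmodified guest value.
;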
4667
4668
4669;;
4670; Initialize the SSE MXCSR register using selected parts of the guest value,
4671; mainly to account for the rounding mode (and FZ/DAZ).
4672;
4673; @uses 4 bytes of stack to save the original value.
4674; @param 1 Expression giving the address of the FXSTATE of the guest.
4675;
4676%macro AVX_LD_XSAVEAREA_MXCSR 1
4677 sub xSP, 4
4678
4679 stmxcsr [xSP]
4680 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4681 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
4682 sub xSP, 4
4683 mov [xSP], T0_32
4684 ldmxcsr [xSP]
4685 add xSP, 4
4686%endmacro
4687
4688
4689;;
4690; Restores the AVX128 MXCSR register with the original value.
4691;
4692; @param 1 Expression giving the address where to return the MXCSR value.
4693;
4694; @note Restores the stack pointer.
4695;
4696%macro AVX128_ST_XSAVEAREA_MXCSR 1
4697 stmxcsr [%1 + IEMAVX128RESULT.MXCSR]
4698
4699 ldmxcsr [xSP]
4700 add xSP, 4
4701%endmacro
4702
4703
4704;;
4705; Restores the AVX256 MXCSR register with the original value.
4706;
4707; @param 1 Expression giving the address where to return the MXCSR value.
4708;
4709; @note Restores the stack pointer.
4710;
4711%macro AVX256_ST_XSAVEAREA_MXCSR 1
4712 stmxcsr [%1 + IEMAVX256RESULT.MXCSR]
4713
4714 ldmxcsr [xSP]
4715 add xSP, 4
4716%endmacro
4717
4718
4719;;
4720; Floating point instruction working on two full sized registers.
4721;
4722; @param 1 The instruction
4723; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4724;
4725; @param A0 FPU context (FXSTATE or XSAVEAREA).
4726; @param A1 Where to return the result including the MXCSR value.
4727; @param A2 Pointer to the first media register size operand (input/output).
4728; @param A3 Pointer to the second media register size operand (input).
4729;
4730%macro IEMIMPL_FP_F2 2
4731BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4732 PROLOGUE_4_ARGS
4733 IEMIMPL_SSE_PROLOGUE
4734 SSE_LD_FXSTATE_MXCSR A0
4735
4736 movdqu xmm0, [A2]
4737 movdqu xmm1, [A3]
4738 %1 xmm0, xmm1
4739 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4740
4741 SSE_ST_FXSTATE_MXCSR A1, A0
4742 IEMIMPL_SSE_EPILOGUE
4743 EPILOGUE_4_ARGS
4744ENDPROC iemAImpl_ %+ %1 %+ _u128
4745
4746 %if %2 == 3
4747BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4748 PROLOGUE_4_ARGS
4749 IEMIMPL_AVX_PROLOGUE
4750 AVX_LD_XSAVEAREA_MXCSR A0
4751
4752 vmovdqu xmm0, [A2]
4753 vmovdqu xmm1, [A3]
4754 v %+ %1 xmm0, xmm0, xmm1
4755 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4756
4757 AVX128_ST_XSAVEAREA_MXCSR A1
4758 IEMIMPL_AVX_EPILOGUE
4759 EPILOGUE_4_ARGS
4760ENDPROC iemAImpl_v %+ %1 %+ _u128
4761
4762BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4763 PROLOGUE_4_ARGS
4764 IEMIMPL_AVX_PROLOGUE
4765 AVX_LD_XSAVEAREA_MXCSR A0
4766
4767 vmovdqu ymm0, [A2]
4768 vmovdqu ymm1, [A3]
4769 v %+ %1 ymm0, ymm0, ymm1
4770 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4771
4772 AVX256_ST_XSAVEAREA_MXCSR A1
4773 IEMIMPL_AVX_EPILOGUE
4774 EPILOGUE_4_ARGS
4775ENDPROC iemAImpl_v %+ %1 %+ _u256
4776 %elif %2 == 2
4777BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4778 PROLOGUE_4_ARGS
4779 IEMIMPL_AVX_PROLOGUE
4780 AVX_LD_XSAVEAREA_MXCSR A0
4781
4782 vmovdqu xmm0, [A2]
4783 vmovdqu xmm1, [A3]
4784 v %+ %1 xmm0, xmm1
4785 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4786
4787 AVX128_ST_XSAVEAREA_MXCSR A1
4788 IEMIMPL_AVX_EPILOGUE
4789 EPILOGUE_4_ARGS
4790ENDPROC iemAImpl_v %+ %1 %+ _u128
4791
4792BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4793 PROLOGUE_4_ARGS
4794 IEMIMPL_AVX_PROLOGUE
4795 AVX_LD_XSAVEAREA_MXCSR A0
4796
4797 vmovdqu ymm0, [A2]
4798 vmovdqu ymm1, [A3]
4799 v %+ %1 ymm0, ymm1
4800 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4801
4802 AVX256_ST_XSAVEAREA_MXCSR A1
4803 IEMIMPL_AVX_EPILOGUE
4804 EPILOGUE_4_ARGS
4805ENDPROC iemAImpl_v %+ %1 %+ _u256
4806 %endif
4807%endmacro
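;
; The second parameter selects the AVX workers to emit: 3 produces the usual
; three-operand VEX forms (e.g. IEMIMPL_FP_F2 addps, 3 also yields
; iemAImpl_vaddps_u128 and iemAImpl_vaddps_u256 doing vaddps ymm0, ymm0, ymm1),
; 2 produces two-operand VEX forms (vsqrtps and friends below), and 0 skips
; the AVX variants altogether.
;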
4808
4809IEMIMPL_FP_F2 addps, 3
4810IEMIMPL_FP_F2 addpd, 3
4811IEMIMPL_FP_F2 mulps, 3
4812IEMIMPL_FP_F2 mulpd, 3
4813IEMIMPL_FP_F2 subps, 3
4814IEMIMPL_FP_F2 subpd, 3
4815IEMIMPL_FP_F2 minps, 3
4816IEMIMPL_FP_F2 minpd, 3
4817IEMIMPL_FP_F2 divps, 3
4818IEMIMPL_FP_F2 divpd, 3
4819IEMIMPL_FP_F2 maxps, 3
4820IEMIMPL_FP_F2 maxpd, 3
4821IEMIMPL_FP_F2 haddps, 3
4822IEMIMPL_FP_F2 haddpd, 3
4823IEMIMPL_FP_F2 hsubps, 3
4824IEMIMPL_FP_F2 hsubpd, 3
4825IEMIMPL_FP_F2 addsubps, 3
4826IEMIMPL_FP_F2 addsubpd, 3
4827
4828
4829;;
4830; These are actually unary operations, but to keep things simple we
4831; treat them as binary for now, so that the output result always stays
4832; in sync with the register the result might get written to.
4834IEMIMPL_FP_F2 sqrtps, 2
4835IEMIMPL_FP_F2 rsqrtps, 2
4836IEMIMPL_FP_F2 sqrtpd, 2
4837IEMIMPL_FP_F2 cvtdq2ps, 2
4838IEMIMPL_FP_F2 cvtps2dq, 2
4839IEMIMPL_FP_F2 cvttps2dq, 2
4840IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants missing right now due to register size differences
4841IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants missing right now due to register size differences
4842IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants missing right now due to register size differences
4843
4844
4845;;
4846; Floating point instruction working on a full sized register and a single precision operand.
4847;
4848; @param 1 The instruction
4849;
4850; @param A0 FPU context (FXSTATE or XSAVEAREA).
4851; @param A1 Where to return the result including the MXCSR value.
4852; @param A2 Pointer to the first media register size operand (input/output).
4853; @param A3 Pointer to the second single precision floating point value (input).
4854;
4855%macro IEMIMPL_FP_F2_R32 1
4856BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
4857 PROLOGUE_4_ARGS
4858 IEMIMPL_SSE_PROLOGUE
4859 SSE_LD_FXSTATE_MXCSR A0
4860
4861 movdqu xmm0, [A2]
4862 movd xmm1, [A3]
4863 %1 xmm0, xmm1
4864 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4865
4866 SSE_ST_FXSTATE_MXCSR A1, A0
4867 IEMIMPL_SSE_EPILOGUE
4868 EPILOGUE_4_ARGS
4869ENDPROC iemAImpl_ %+ %1 %+ _u128_r32
4870
4871BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
4872 PROLOGUE_4_ARGS
4873 IEMIMPL_AVX_PROLOGUE
4874 AVX_LD_XSAVEAREA_MXCSR A0
4875
4876 vmovdqu xmm0, [A2]
4877 vmovd xmm1, [A3]
4878 v %+ %1 xmm0, xmm0, xmm1
4879 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4880
4881 AVX128_ST_XSAVEAREA_MXCSR A1
4882 IEMIMPL_AVX_EPILOGUE
4883 EPILOGUE_4_ARGS
4884ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
4885%endmacro
4886
4887IEMIMPL_FP_F2_R32 addss
4888IEMIMPL_FP_F2_R32 mulss
4889IEMIMPL_FP_F2_R32 subss
4890IEMIMPL_FP_F2_R32 minss
4891IEMIMPL_FP_F2_R32 divss
4892IEMIMPL_FP_F2_R32 maxss
4893IEMIMPL_FP_F2_R32 cvtss2sd
4894IEMIMPL_FP_F2_R32 sqrtss
4895IEMIMPL_FP_F2_R32 rsqrtss
4896
4897
4898;;
4899; Floating point instruction working on a full sized register and a double precision operand.
4900;
4901; @param 1 The instruction
4902;
4903; @param A0 FPU context (FXSTATE or XSAVEAREA).
4904; @param A1 Where to return the result including the MXCSR value.
4905; @param A2 Pointer to the first media register size operand (input/output).
4906; @param A3 Pointer to the second double precision floating point value (input).
4907;
4908%macro IEMIMPL_FP_F2_R64 1
4909BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
4910 PROLOGUE_4_ARGS
4911 IEMIMPL_SSE_PROLOGUE
4912 SSE_LD_FXSTATE_MXCSR A0
4913
4914 movdqu xmm0, [A2]
4915 movq xmm1, [A3]
4916 %1 xmm0, xmm1
4917 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4918
4919 SSE_ST_FXSTATE_MXCSR A1, A0
4920 IEMIMPL_SSE_EPILOGUE
4921 EPILOGUE_4_ARGS
4922ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
4923
4924BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
4925 PROLOGUE_4_ARGS
4926 IEMIMPL_AVX_PROLOGUE
4927 AVX_LD_XSAVEAREA_MXCSR A0
4928
4929 vmovdqu xmm0, [A2]
4930 vmovq xmm1, [A3]
4931 v %+ %1 xmm0, xmm0, xmm1
4932 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4933
4934 AVX128_ST_XSAVEAREA_MXCSR A1
4935 IEMIMPL_AVX_EPILOGUE
4936 EPILOGUE_4_ARGS
4937ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
4938%endmacro
4939
4940IEMIMPL_FP_F2_R64 addsd
4941IEMIMPL_FP_F2_R64 mulsd
4942IEMIMPL_FP_F2_R64 subsd
4943IEMIMPL_FP_F2_R64 minsd
4944IEMIMPL_FP_F2_R64 divsd
4945IEMIMPL_FP_F2_R64 maxsd
4946IEMIMPL_FP_F2_R64 cvtsd2ss
4947IEMIMPL_FP_F2_R64 sqrtsd
4948
4949
4950;;
4951; Macro for the cvtpd2ps/cvtps2pd instructions.
4952;
4953; @param 1 The instruction name.
4954; @param 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
4955;
4956; @param A0 FPU context (FXSTATE or XSAVEAREA).
4957; @param A1 Where to return the result including the MXCSR value.
4958; @param A2 Pointer to the first media register size operand (input/output).
4959; @param A3 Pointer to the second media register size operand (input).
4960;
4961%macro IEMIMPL_CVT_F2 2
4962BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4963 PROLOGUE_4_ARGS
4964 IEMIMPL_SSE_PROLOGUE
4965 SSE_LD_FXSTATE_MXCSR A0
4966
4967 movdqu xmm0, [A2]
4968 movdqu xmm1, [A3]
4969 %1 xmm0, xmm1
4970 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4971
4972 SSE_ST_FXSTATE_MXCSR A1, A0
4973 IEMIMPL_SSE_EPILOGUE
4974 EPILOGUE_4_ARGS
4975ENDPROC iemAImpl_ %+ %1 %+ _u128
4976
4977BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
4978 PROLOGUE_4_ARGS
4979 IEMIMPL_AVX_PROLOGUE
4980 AVX_LD_XSAVEAREA_MXCSR A0
4981
4982 vmovdqu xmm0, [A2]
4983 vmovdqu xmm1, [A3]
4984 v %+ %1 xmm0, xmm1
4985 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4986
4987 AVX128_ST_XSAVEAREA_MXCSR A1
4988 IEMIMPL_AVX_EPILOGUE
4989 EPILOGUE_4_ARGS
4990ENDPROC iemAImpl_v %+ %1 %+ _u128
4991
4992BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
4993 PROLOGUE_4_ARGS
4994 IEMIMPL_AVX_PROLOGUE
4995 AVX_LD_XSAVEAREA_MXCSR A0
4996
4997 vmovdqu ymm0, [A2]
4998 vmovdqu ymm1, [A3]
4999 %if %2 == 0
5000 v %+ %1 xmm0, ymm1
5001 %else
5002 v %+ %1 ymm0, xmm1
5003 %endif
5004 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
5005
5006 AVX256_ST_XSAVEAREA_MXCSR A1
5007 IEMIMPL_AVX_EPILOGUE
5008 EPILOGUE_4_ARGS
5009ENDPROC iemAImpl_v %+ %1 %+ _u256
5010%endmacro
5011
5012IEMIMPL_CVT_F2 cvtpd2ps, 0
5013IEMIMPL_CVT_F2 cvtps2pd, 1
5014
5015
5016;;
5017; shufps instruction with 8-bit immediates.
5018;
5019; @param A0 Pointer to the destination media register size operand (input/output).
5020; @param A1 Pointer to the first source media register size operand (input).
5021; @param A2 The 8-bit immediate
5022;
5023BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
5024 PROLOGUE_3_ARGS
5025 IEMIMPL_SSE_PROLOGUE
5026
5027 movdqu xmm0, [A0]
5028 movdqu xmm1, [A1]
5029 lea T1, [.imm0 xWrtRIP]
5030 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5031 lea T0, [A2 + A2*4] ; sizeof(endbrxx+shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
5032 %else
5033 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
5034 %endif
5035 lea T1, [T1 + T0*2]
5036 IBT_NOTRACK
5037 call T1
5038 movdqu [A0], xmm0
5039
5040 IEMIMPL_SSE_EPILOGUE
5041 EPILOGUE_3_ARGS
5042 %assign bImm 0
5043 %rep 256
5044.imm %+ bImm:
5045 IBT_ENDBRxx_WITHOUT_NOTRACK
5046 shufps xmm0, xmm1, bImm
5047 ret
5048 int3
5049 %assign bImm bImm + 1
5050 %endrep
5051.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5052ENDPROC iemAImpl_shufps_u128
5053
5054
5055;;
5056; shufpd instruction with 8-bit immediates.
5057;
5058; @param A0 Pointer to the destination media register size operand (input/output).
5059; @param A1 Pointer to the first source media register size operand (input).
5060; @param A2 The 8-bit immediate
5061;
5062BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
5063 PROLOGUE_3_ARGS
5064 IEMIMPL_SSE_PROLOGUE
5065
5066 movdqu xmm0, [A0]
5067 movdqu xmm1, [A1]
5068 lea T1, [.imm0 xWrtRIP]
5069 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5070 lea T0, [A2 + A2*4] ; sizeof(endbrxx+shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
5071 %else
5072 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
5073 %endif
5074 lea T1, [T1 + T0*2]
5075 IBT_NOTRACK
5076 call T1
5077 movdqu [A0], xmm0
5078
5079 IEMIMPL_SSE_EPILOGUE
5080 EPILOGUE_3_ARGS
5081 %assign bImm 0
5082 %rep 256
5083.imm %+ bImm:
5084 IBT_ENDBRxx_WITHOUT_NOTRACK
5085 shufpd xmm0, xmm1, bImm
5086 ret
5087 %assign bImm bImm + 1
5088 %endrep
5089.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5090ENDPROC iemAImpl_shufpd_u128
5091
5092
5093;;
5094; vshufp{s,d} instructions with 8-bit immediates.
5095;
5096; @param 1 The instruction name.
5097;
5098; @param A0 Pointer to the destination media register size operand (output).
5099; @param A1 Pointer to the first source media register size operand (input).
5100; @param A2 Pointer to the second source media register size operand (input).
5101; @param A3 The 8-bit immediate
5102;
5103%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
5104BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5105 PROLOGUE_4_ARGS
5106 IEMIMPL_AVX_PROLOGUE
5107
5108 movdqu xmm0, [A1]
5109 movdqu xmm1, [A2]
5110 lea T1, [.imm0 xWrtRIP]
5111 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5112 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5113 %else
5114 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5115 %endif
5116 lea T1, [T1 + T0*2]
5117 IBT_NOTRACK
5118 call T1
5119 movdqu [A0], xmm0
5120
5121 IEMIMPL_AVX_EPILOGUE
5122 EPILOGUE_4_ARGS
5123 %assign bImm 0
5124 %rep 256
5125.imm %+ bImm:
5126 IBT_ENDBRxx_WITHOUT_NOTRACK
5127 %1 xmm0, xmm0, xmm1, bImm
5128 ret
5129 %assign bImm bImm + 1
5130 %endrep
5131.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5132ENDPROC iemAImpl_ %+ %1 %+ _u128
5133
5134BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5135 PROLOGUE_4_ARGS
5136 IEMIMPL_AVX_PROLOGUE
5137
5138 vmovdqu ymm0, [A1]
5139 vmovdqu ymm1, [A2]
5140 lea T1, [.imm0 xWrtRIP]
5141 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5142 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5143 %else
5144 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5145 %endif
5146 lea T1, [T1 + T0*2]
5147 IBT_NOTRACK
5148 call T1
5149 vmovdqu [A0], ymm0
5150
5151 IEMIMPL_AVX_EPILOGUE
5152 EPILOGUE_4_ARGS
5153 %assign bImm 0
5154 %rep 256
5155.imm %+ bImm:
5156 IBT_ENDBRxx_WITHOUT_NOTRACK
5157 %1 ymm0, ymm0, ymm1, bImm
5158 ret
5159 %assign bImm bImm + 1
5160 %endrep
5161.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5162ENDPROC iemAImpl_ %+ %1 %+ _u256
5163%endmacro
5164
5165IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
5166IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5167
5168
5169;;
5170; One of the [p]blendv{b,ps,pd} variants
5171;
5172; @param 1 The instruction
5173;
5174; @param A0 Pointer to the first media register sized operand (input/output).
5175; @param A1 Pointer to the second media sized value (input).
5176; @param A2 Pointer to the media register sized mask value (input).
5177;
5178%macro IEMIMPL_P_BLEND 1
5179BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5180 PROLOGUE_3_ARGS
5181 IEMIMPL_SSE_PROLOGUE
5182
5183 movdqu xmm0, [A2] ; The mask operand is implicitly xmm0
5184 movdqu xmm1, [A0]
5185 movdqu xmm2, [A1] ; @todo Do I need to save the original value here first?
5186 %1 xmm1, xmm2
5187 movdqu [A0], xmm1
5188
5189 IEMIMPL_SSE_EPILOGUE
5190 EPILOGUE_3_ARGS
5191ENDPROC iemAImpl_ %+ %1 %+ _u128
5192%endmacro
5193
5194IEMIMPL_P_BLEND pblendvb
5195IEMIMPL_P_BLEND blendvps
5196IEMIMPL_P_BLEND blendvpd
5197
5198
5199;;
5200; One of the v[p]blendv{b,ps,pd} variants
5201;
5202; @param 1 The instruction
5203;
5204; @param A0 Pointer to the first media register sized operand (output).
5205; @param A1 Pointer to the first media register sized operand (input).
5206; @param A2 Pointer to the second media register sized operand (input).
5207; @param A3 Pointer to the media register sized mask value (input).
5208%macro IEMIMPL_AVX_P_BLEND 1
5209BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5210 PROLOGUE_4_ARGS
5211 IEMIMPL_AVX_PROLOGUE
5212
5213 vmovdqu xmm0, [A1]
5214 vmovdqu xmm1, [A2]
5215 vmovdqu xmm2, [A3]
5216 %1 xmm0, xmm0, xmm1, xmm2
5217 vmovdqu [A0], xmm0
5218
5219 IEMIMPL_AVX_EPILOGUE
5220 EPILOGUE_4_ARGS
5221ENDPROC iemAImpl_ %+ %1 %+ _u128
5222
5223BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5224 PROLOGUE_4_ARGS
5225 IEMIMPL_AVX_PROLOGUE
5226
5227 vmovdqu ymm0, [A1]
5228 vmovdqu ymm1, [A2]
5229 vmovdqu ymm2, [A3]
5230 %1 ymm0, ymm0, ymm1, ymm2
5231 vmovdqu [A0], ymm0
5232
5233 IEMIMPL_AVX_EPILOGUE
5234 EPILOGUE_4_ARGS
5235ENDPROC iemAImpl_ %+ %1 %+ _u256
5236%endmacro
5237
5238IEMIMPL_AVX_P_BLEND vpblendvb
5239IEMIMPL_AVX_P_BLEND vblendvps
5240IEMIMPL_AVX_P_BLEND vblendvpd
5241
5242
5243;;
5244; palignr mm1, mm2/m64 instruction.
5245;
5246; @param A0 Pointer to the first media register sized operand (output).
5247; @param A1 The second register sized operand (input).
5248; @param A2 The 8-bit immediate.
5249BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
5250 PROLOGUE_3_ARGS
5251 IEMIMPL_MMX_PROLOGUE
5252
5253 movq mm0, [A0]
5254 movq mm1, A1
5255 lea T1, [.imm0 xWrtRIP]
5256 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5257 lea T0, [A2 + A2*4] ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
5258 %else
5259 lea T0, [A2 + A2*2] ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
5260 %endif
5261 lea T1, [T1 + T0*2]
5262 IBT_NOTRACK
5263 call T1
5264 movq [A0], mm0
5265
5266 IEMIMPL_MMX_EPILOGUE
5267 EPILOGUE_3_ARGS
5268 %assign bImm 0
5269 %rep 256
5270.imm %+ bImm:
5271 IBT_ENDBRxx_WITHOUT_NOTRACK
5272 palignr mm0, mm1, bImm
5273 ret
5274 %assign bImm bImm + 1
5275 %endrep
5276.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5277ENDPROC iemAImpl_palignr_u64
5278
5279
5280;;
5281; SSE instructions with 8-bit immediates of the form
5282; xxx xmm1, xmm2, imm8.
5283; where the instruction encoding takes up 6 bytes.
5284;
5285; @param 1 The instruction name.
5286;
5287; @param A0 Pointer to the first media register size operand (input/output).
5288; @param A1 Pointer to the second source media register size operand (input).
5289; @param A2 The 8-bit immediate
5290;
5291%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
5292BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5293 PROLOGUE_3_ARGS
5294 IEMIMPL_SSE_PROLOGUE
5295
5296 movdqu xmm0, [A0]
5297 movdqu xmm1, [A1]
5298 lea T1, [.imm0 xWrtRIP]
5299 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5300 lea T0, [A2 + A2*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
5301 lea T1, [T1 + T0*4]
5302 %else
5303 lea T1, [T1 + A2*8] ; sizeof(insnX+ret+int3) == 8: A2 * 8
5304 %endif
5305 IBT_NOTRACK
5306 call T1
5307 movdqu [A0], xmm0
5308
5309 IEMIMPL_SSE_EPILOGUE
5310 EPILOGUE_3_ARGS
5311 %assign bImm 0
5312 %rep 256
5313.imm %+ bImm:
5314 IBT_ENDBRxx_WITHOUT_NOTRACK
5315 %1 xmm0, xmm1, bImm
5316 ret
5317 int3
5318 %assign bImm bImm + 1
5319 %endrep
5320.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5321ENDPROC iemAImpl_ %+ %1 %+ _u128
5322%endmacro
5323
5324IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
5325IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
5326IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
5327IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
5328IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
5329IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5330IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5331
5332
5333;;
5334; AVX instructions with 8-bit immediates of the form
5335; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5336; where the instruction encoding takes up 6 bytes.
5337;
5338; @param 1 The instruction name.
5339; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5340; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5341;
5342; @param A0 Pointer to the destination media register size operand (output).
5343; @param A1 Pointer to the first source media register size operand (input).
5344; @param A2 Pointer to the second source media register size operand (input).
5345; @param A3 The 8-bit immediate
5346;
5347%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
5348 %if %2 == 1
5349BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5350 PROLOGUE_4_ARGS
5351 IEMIMPL_AVX_PROLOGUE
5352
5353 movdqu xmm0, [A1]
5354 movdqu xmm1, [A2]
5355 lea T1, [.imm0 xWrtRIP]
5356 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5357 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5358 lea T1, [T1 + T0*4]
5359 %else
5360 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5361 %endif
5362 IBT_NOTRACK
5363 call T1
5364 movdqu [A0], xmm0
5365
5366 IEMIMPL_AVX_EPILOGUE
5367 EPILOGUE_4_ARGS
5368 %assign bImm 0
5369 %rep 256
5370.imm %+ bImm:
5371 IBT_ENDBRxx_WITHOUT_NOTRACK
5372 %1 xmm0, xmm0, xmm1, bImm
5373 ret
5374 int3
5375 %assign bImm bImm + 1
5376 %endrep
5377.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5378ENDPROC iemAImpl_ %+ %1 %+ _u128
5379 %endif
5380
5381 %if %3 == 1
5382BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5383 PROLOGUE_4_ARGS
5384 IEMIMPL_AVX_PROLOGUE
5385
5386 vmovdqu ymm0, [A1]
5387 vmovdqu ymm1, [A2]
5388 lea T1, [.imm0 xWrtRIP]
5389 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5390 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5391 lea T1, [T1 + T0*4]
5392 %else
5393 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5394 %endif
5395 IBT_NOTRACK
5396 call T1
5397 vmovdqu [A0], ymm0
5398
5399 IEMIMPL_AVX_EPILOGUE
5400 EPILOGUE_4_ARGS
5401 %assign bImm 0
5402 %rep 256
5403.imm %+ bImm:
5404 IBT_ENDBRxx_WITHOUT_NOTRACK
5405 %1 ymm0, ymm0, ymm1, bImm
5406 ret
5407 int3
5408 %assign bImm bImm + 1
5409 %endrep
5410.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5411ENDPROC iemAImpl_ %+ %1 %+ _u256
5412 %endif
5413%endmacro
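;
; The two flags select which workers are instantiated; e.g. vpclmulqdq below
; only gets a 128-bit worker, while vperm2i128/vperm2f128 only get 256-bit
; workers since those instructions have no 128-bit register form.
;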
5414
5415IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
5416IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
5417IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
5418IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
5419IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
5420IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
5421IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
5422
5423
5424;;
5425; Need to move this somewhere better as well?
5426;
5427struc IEMPCMPISTRXSRC
5428 .uSrc1 resd 4
5429 .uSrc2 resd 4
5430endstruc
5431
5432struc IEMPCMPESTRXSRC
5433 .uSrc1 resd 4
5434 .uSrc2 resd 4
5435 .u64Rax resd 2
5436 .u64Rdx resd 2
5437endstruc
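; Resulting layout for reference: uSrc1 at offset 0, uSrc2 at offset 16, and
; for the ESTR variant u64Rax at offset 32 and u64Rdx at offset 40 (48 bytes
; in total).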
5438
5439;;
5440; The pcmpistri instruction.
5441;
5442; @param A0 Pointer to the ECX register to store the result to (output).
5443; @param A1 Pointer to the EFLAGS register.
5444; @param A2 Pointer to the structure containing the source operands (input).
5445; @param A3 The 8-bit immediate
5446;
5447BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
5448 PROLOGUE_4_ARGS
5449 IEMIMPL_SSE_PROLOGUE
5450
5451 movdqu xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
5452 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
5453 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions and would get overwritten below (T2 is only available on AMD64)
5454 lea T1, [.imm0 xWrtRIP]
5455 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5456 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5457 lea T1, [T1 + T0*4]
5458 %else
5459 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5460 %endif
5461 IBT_NOTRACK
5462 call T1
5463
5464 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5465 mov [T2], ecx
5466
5467 IEMIMPL_SSE_EPILOGUE
5468 EPILOGUE_4_ARGS
5469 %assign bImm 0
5470 %rep 256
5471.imm %+ bImm:
5472 IBT_ENDBRxx_WITHOUT_NOTRACK
5473 pcmpistri xmm0, xmm1, bImm
5474 ret
5475 int3
5476 %assign bImm bImm + 1
5477 %endrep
5478.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5479ENDPROC iemAImpl_pcmpistri_u128
5480
5481;;
5482; The pcmpestri instruction.
5483;
5484; @param A0 Pointer to the ECX register to store the result to (output).
5485; @param A1 Pointer to the EFLAGS register.
5486; @param A2 Pointer to the structure containing the source operands (input).
5487; @param A3 The 8-bit immediate
5488;
5489BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
5490 PROLOGUE_4_ARGS
5491 IEMIMPL_SSE_PROLOGUE
5492
5493 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
5494 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
5495 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions and would get overwritten below (T2 is only available on AMD64)
5496 lea T1, [.imm0 xWrtRIP]
5497 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5498 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5499 lea T1, [T1 + T0*4]
5500 %else
5501 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5502 %endif
5503 push xDX ; xDX can be A1 or A2 depending on the calling convention
5504 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5505 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5506 IBT_NOTRACK
5507 call T1
5508
5509 pop xDX
5510 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5511 mov [T2], ecx
5512
5513 IEMIMPL_SSE_EPILOGUE
5514 EPILOGUE_4_ARGS
5515 %assign bImm 0
5516 %rep 256
5517.imm %+ bImm:
5518 IBT_ENDBRxx_WITHOUT_NOTRACK
5519 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5520 pcmpestri xmm0, xmm1, bImm
5521 ret
5522 %assign bImm bImm + 1
5523 %endrep
5524.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5525ENDPROC iemAImpl_pcmpestri_u128
5526
5527;;
5528; The pcmpistrm instruction template.
5529;
5530; @param A0 Pointer to the XMM0 register to store the result to (output).
5531; @param A1 Pointer to the EFLAGS register.
5532; @param A2 Pointer to the structure containing the source operands (input).
5533; @param A3 The 8-bit immediate
5534;
5535BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
5536 PROLOGUE_4_ARGS
5537 IEMIMPL_SSE_PROLOGUE
5538
5539 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
5540 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
5541 lea T1, [.imm0 xWrtRIP]
5542 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5543 lea T0, [A3 + A3*2] ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
5544 lea T1, [T1 + T0*4]
5545 %else
5546 lea T1, [T1 + A3*8] ; sizeof(pcmpistrm+ret) == 8: A3 * 8
5547 %endif
5548 IBT_NOTRACK
5549 call T1
5550
5551 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5552 movdqu [A0], xmm0
5553
5554 IEMIMPL_SSE_EPILOGUE
5555 EPILOGUE_4_ARGS
5556 %assign bImm 0
5557 %rep 256
5558.imm %+ bImm:
5559 IBT_ENDBRxx_WITHOUT_NOTRACK
5560 pcmpistrm xmm1, xmm2, bImm
5561 ret
5562 int3
5563 %assign bImm bImm + 1
5564 %endrep
5565.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5566ENDPROC iemAImpl_pcmpistrm_u128
5567
5568;;
5569; The pcmpestrm instruction template.
5570;
5571; @param A0 Pointer to the XMM0 register to store the result to (output).
5572; @param A1 Pointer to the EFLAGS register.
5573; @param A2 Pointer to the structure containing the source operands (input).
5574; @param A3 The 8-bit immediate
5575;
5576BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
5577 PROLOGUE_4_ARGS
5578 IEMIMPL_SSE_PROLOGUE
5579
5580 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
5581 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
5582 lea T1, [.imm0 xWrtRIP]
5583 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5584 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5585 lea T1, [T1 + T0*4]
5586 %else
5587 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5588 %endif
5589 push xDX ; xDX can be A1 or A2 depending on the calling convention
5590 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5591 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5592 IBT_NOTRACK
5593 call T1
5594
5595 pop xDX
5596 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5597 movdqu [A0], xmm0
5598
5599 IEMIMPL_SSE_EPILOGUE
5600 EPILOGUE_4_ARGS
5601 %assign bImm 0
5602 %rep 256
5603.imm %+ bImm:
5604 IBT_ENDBRxx_WITHOUT_NOTRACK
5605 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5606 pcmpestrm xmm1, xmm2, bImm
5607 ret
5608 %assign bImm bImm + 1
5609 %endrep
5610.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5611ENDPROC iemAImpl_pcmpestrm_u128
5612
5613
5614;;
5615; pinsrw instruction.
5616;
5617; @param A0 Pointer to the first media register size operand (input/output).
5618; @param A1 The 16-bit input operand (input).
5619; @param A2 The 8-bit immediate
5620;
5621BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
5622 PROLOGUE_3_ARGS
5623 IEMIMPL_SSE_PROLOGUE
5624
5625 movq mm0, [A0]
5626 lea T1, [.imm0 xWrtRIP]
5627 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5628 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
5629 %else
5630 lea T0, [A2 + A2*4] ; sizeof(pinsrw+ret) == 5: A2 * 5
5631 %endif
5632 lea T1, [T1 + T0]
5633 IBT_NOTRACK
5634 call T1
5635 movq [A0], mm0
5636
5637 IEMIMPL_SSE_EPILOGUE
5638 EPILOGUE_3_ARGS
5639 %assign bImm 0
5640 %rep 256
5641.imm %+ bImm:
5642 IBT_ENDBRxx_WITHOUT_NOTRACK
5643 pinsrw mm0, A1_32, bImm
5644 ret
5645 %assign bImm bImm + 1
5646 %endrep
5647.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5648ENDPROC iemAImpl_pinsrw_u64
5649
5650BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
5651 PROLOGUE_3_ARGS
5652 IEMIMPL_SSE_PROLOGUE
5653
5654 movdqu xmm0, [A0]
5655 lea T1, [.imm0 xWrtRIP]
5656 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5657 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5658 %else
5659 lea T0, [A2 + A2*2] ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5660 %endif
5661 lea T1, [T1 + T0*2]
5662 IBT_NOTRACK
5663 call T1
5664 movdqu [A0], xmm0
5665
5666 IEMIMPL_SSE_EPILOGUE
5667 EPILOGUE_3_ARGS
5668 %assign bImm 0
5669 %rep 256
5670.imm %+ bImm:
5671 IBT_ENDBRxx_WITHOUT_NOTRACK
5672 pinsrw xmm0, A1_32, bImm
5673 ret
5674 %assign bImm bImm + 1
5675 %endrep
5676.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5677ENDPROC iemAImpl_pinsrw_u128
5678
5679;;
5680; vpinsrw instruction.
5681;
5682; @param A0 Pointer to the first media register size operand (output).
5683; @param A1 Pointer to the source media register size operand (input).
5684; @param A2 The 16-bit input operand (input).
5685; @param A3 The 8-bit immediate
5686;
5687BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
5688 PROLOGUE_4_ARGS
5689 IEMIMPL_SSE_PROLOGUE
5690
5691 movdqu xmm0, [A1]
5692 lea T1, [.imm0 xWrtRIP]
5693 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5694 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
5695 %else
5696 lea T0, [A3 + A3*2] ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
5697 %endif
5698 lea T1, [T1 + T0*2]
5699 mov A1, A2 ; A2 requires longer encoding on Windows
5700 IBT_NOTRACK
5701 call T1
5702 movdqu [A0], xmm0
5703
5704 IEMIMPL_SSE_EPILOGUE
5705 EPILOGUE_4_ARGS
5706 %assign bImm 0
5707 %rep 256
5708.imm %+ bImm:
5709 IBT_ENDBRxx_WITHOUT_NOTRACK
5710 vpinsrw xmm0, xmm0, A1_32, bImm
5711 ret
5712 %assign bImm bImm + 1
5713 %endrep
5714.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5715ENDPROC iemAImpl_vpinsrw_u128
5716
5717
5718;;
5719; pextrw instruction.
5720;
5721; @param A0 Pointer to the 16-bit output operand (output).
5722; @param A1 The 64-bit source operand (MMX variant) or a pointer to the source media register (SSE variant) (input).
5723; @param A2 The 8-bit immediate
5724;
5725BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
5726 PROLOGUE_3_ARGS
5727 IEMIMPL_SSE_PROLOGUE
5728
5729 movq mm0, A1
5730 lea T1, [.imm0 xWrtRIP]
5731 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5732 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
5733 %else
5734 lea T0, [A2 + A2*4] ; sizeof(pextrw+ret) == 5: A2 * 5
5735 %endif
5736 lea T1, [T1 + T0]
5737 IBT_NOTRACK
5738 call T1
5739 mov word [A0], T0_16
5740
5741 IEMIMPL_SSE_EPILOGUE
5742 EPILOGUE_3_ARGS
5743 %assign bImm 0
5744 %rep 256
5745.imm %+ bImm:
5746 IBT_ENDBRxx_WITHOUT_NOTRACK
5747 pextrw T0_32, mm0, bImm
5748 ret
5749 %assign bImm bImm + 1
5750 %endrep
5751.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5752ENDPROC iemAImpl_pextrw_u64
5753
5754BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
5755 PROLOGUE_3_ARGS
5756 IEMIMPL_SSE_PROLOGUE
5757
5758 movdqu xmm0, [A1]
5759 lea T1, [.imm0 xWrtRIP]
5760 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5761 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5762 %else
5763 lea T0, [A2 + A2*2] ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5764 %endif
5765 lea T1, [T1 + T0*2]
5766 IBT_NOTRACK
5767 call T1
5768 mov word [A0], T0_16
5769
5770 IEMIMPL_SSE_EPILOGUE
5771 EPILOGUE_3_ARGS
5772 %assign bImm 0
5773 %rep 256
5774.imm %+ bImm:
5775 IBT_ENDBRxx_WITHOUT_NOTRACK
5776 pextrw T0_32, xmm0, bImm
5777 ret
5778 %assign bImm bImm + 1
5779 %endrep
5780.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5781ENDPROC iemAImpl_pextrw_u128
5782
5783;;
5784; vpextrw instruction.
5785;
5786; @param A0 Pointer to the 16-bit output operand (output).
5787; @param A1 Pointer to the source media register size operand (input).
5788; @param A2 The 8-bit immediate
5789;
5790BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
5791 PROLOGUE_3_ARGS
5792 IEMIMPL_SSE_PROLOGUE
5793
5794 movdqu xmm0, [A1]
5795 lea T1, [.imm0 xWrtRIP]
5796 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5797 lea T0, [A2 + A2*4] ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5798 %else
5799 lea T0, [A2 + A2*2] ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5800 %endif
5801 lea T1, [T1 + T0*2]
5802 IBT_NOTRACK
5803 call T1
5804 mov word [A0], T0_16
5805
5806 IEMIMPL_SSE_EPILOGUE
5807 EPILOGUE_3_ARGS
5808 %assign bImm 0
5809 %rep 256
5810.imm %+ bImm:
5811 IBT_ENDBRxx_WITHOUT_NOTRACK
5812 vpextrw T0_32, xmm0, bImm
5813 ret
5814 %assign bImm bImm + 1
5815 %endrep
5816.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5817ENDPROC iemAImpl_vpextrw_u128
5818
5819
5820;;
5821; movmskp{s,d} SSE instruction template
5822;
5823; @param 1 The SSE instruction name.
5824; @param 2 The AVX instruction name.
5825;
5826; @param A0 Pointer to the output register (output/byte sized).
5827; @param A1 Pointer to the source media register size operand (input).
5828;
5829%macro IEMIMPL_MEDIA_MOVMSK_P 2
5830BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5831 PROLOGUE_2_ARGS
5832 IEMIMPL_SSE_PROLOGUE
5833
5834 movdqu xmm0, [A1]
5835 %1 T0, xmm0
5836 mov byte [A0], T0_8
5837
5838 IEMIMPL_SSE_EPILOGUE
5839 EPILOGUE_2_ARGS
5840ENDPROC iemAImpl_ %+ %1 %+ _u128
5841
5842BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
5843 PROLOGUE_2_ARGS
5844 IEMIMPL_AVX_PROLOGUE
5845
5846 movdqu xmm0, [A1]
5847 %2 T0, xmm0
5848 mov byte [A0], T0_8
5849
5850 IEMIMPL_AVX_EPILOGUE
5851 EPILOGUE_2_ARGS
5852ENDPROC iemAImpl_ %+ %2 %+ _u128
5853
5854BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
5855 PROLOGUE_2_ARGS
5856 IEMIMPL_AVX_PROLOGUE
5857
5858 vmovdqu ymm0, [A1]
5859 %2 T0, ymm0
5860 mov byte [A0], T0_8
5861
5862 IEMIMPL_AVX_EPILOGUE
5863 EPILOGUE_2_ARGS
5864ENDPROC iemAImpl_ %+ %2 %+ _u256
5865%endmacro
5866
5867IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
5868IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5869
5870
5871;;
5872; Restores the SSE MXCSR register with the original value.
5873;
5874; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
5875; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
5876; @param 2 Expression giving the address of the FXSTATE of the guest.
5877;
5878; @note Restores the stack pointer.
5879;
5880%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
5881 sub xSP, 4
5882 stmxcsr [xSP]
5883 mov T0_32, [xSP]
5884 add xSP, 4
5885 ; Merge the status bits into the original MXCSR value.
5886 mov T1_32, [%2 + X86FXSTATE.MXCSR]
5887 and T0_32, X86_MXCSR_XCPT_FLAGS
5888 or T0_32, T1_32
5889 mov [%1], T0_32
5890
5891 ldmxcsr [xSP]
5892 add xSP, 4
5893%endmacro
5894
5895
5896;;
5897; cvttsd2si instruction - 32-bit variant.
5898;
5899; @param A0 FPU context (FXSTATE or XSAVEAREA).
5900; @param A1 Where to return the MXCSR value.
5901; @param A2 Pointer to the result operand (output).
5902; @param A3 Pointer to the second operand (input).
5903;
5904BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
5905 PROLOGUE_4_ARGS
5906 IEMIMPL_SSE_PROLOGUE
5907 SSE_LD_FXSTATE_MXCSR A0
5908
5909 cvttsd2si T0_32, [A3]
5910 mov dword [A2], T0_32
5911
5912 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5913 IEMIMPL_SSE_EPILOGUE
5914 EPILOGUE_4_ARGS
5915ENDPROC iemAImpl_cvttsd2si_i32_r64
5916
5917;;
5918; cvttsd2si instruction - 64-bit variant.
5919;
5920; @param A0 FPU context (FXSTATE or XSAVEAREA).
5921; @param A1 Where to return the MXCSR value.
5922; @param A2 Pointer to the result operand (output).
5923; @param A3 Pointer to the second operand (input).
5924;
5925BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
5926 PROLOGUE_4_ARGS
5927 IEMIMPL_SSE_PROLOGUE
5928 SSE_LD_FXSTATE_MXCSR A0
5929
5930 cvttsd2si T0, [A3]
5931 mov qword [A2], T0
5932
5933 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5934 IEMIMPL_SSE_EPILOGUE
5935 EPILOGUE_4_ARGS
5936ENDPROC iemAImpl_cvttsd2si_i64_r64
5937
5938
5939;;
5940; cvtsd2si instruction - 32-bit variant.
5941;
5942; @param A0 FPU context (FXSTATE or XSAVEAREA).
5943; @param A1 Where to return the MXCSR value.
5944; @param A2 Pointer to the result operand (output).
5945; @param A3 Pointer to the second operand (input).
5946;
5947BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
5948 PROLOGUE_4_ARGS
5949 IEMIMPL_SSE_PROLOGUE
5950 SSE_LD_FXSTATE_MXCSR A0
5951
5952 cvtsd2si T0_32, [A3]
5953 mov dword [A2], T0_32
5954
5955 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5956 IEMIMPL_SSE_EPILOGUE
5957 EPILOGUE_4_ARGS
5958ENDPROC iemAImpl_cvtsd2si_i32_r64
5959
5960;;
5961; cvtsd2si instruction - 64-bit variant.
5962;
5963; @param A0 FPU context (FXSTATE or XSAVEAREA).
5964; @param A1 Where to return the MXCSR value.
5965; @param A2 Pointer to the result operand (output).
5966; @param A3 Pointer to the second operand (input).
5967;
5968BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
5969 PROLOGUE_4_ARGS
5970 IEMIMPL_SSE_PROLOGUE
5971 SSE_LD_FXSTATE_MXCSR A0
5972
5973 cvtsd2si T0, [A3]
5974 mov qword [A2], T0
5975
5976 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5977 IEMIMPL_SSE_EPILOGUE
5978 EPILOGUE_4_ARGS
5979ENDPROC iemAImpl_cvtsd2si_i64_r64
5980
5981
5982;;
5983; cvttss2si instruction - 32-bit variant.
5984;
5985; @param A0 FPU context (FXSTATE or XSAVEAREA).
5986; @param A1 Where to return the MXCSR value.
5987; @param A2 Pointer to the result operand (output).
5988; @param A3 Pointer to the second operand (input).
5989;
5990BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
5991 PROLOGUE_4_ARGS
5992 IEMIMPL_SSE_PROLOGUE
5993 SSE_LD_FXSTATE_MXCSR A0
5994
5995 cvttss2si T0_32, [A3]
5996 mov dword [A2], T0_32
5997
5998 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5999 IEMIMPL_SSE_EPILOGUE
6000 EPILOGUE_4_ARGS
6001ENDPROC iemAImpl_cvttss2si_i32_r32
6002
6003;;
6004; cvttss2si instruction - 64-bit variant.
6005;
6006; @param A0 FPU context (FXSTATE or XSAVEAREA).
6007; @param A1 Where to return the MXCSR value.
6008; @param A2 Pointer to the result operand (output).
6009; @param A3 Pointer to the second operand (input).
6010;
6011BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
6012 PROLOGUE_4_ARGS
6013 IEMIMPL_SSE_PROLOGUE
6014 SSE_LD_FXSTATE_MXCSR A0
6015
6016 cvttss2si T0, [A3]
6017 mov qword [A2], T0
6018
6019 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6020 IEMIMPL_SSE_EPILOGUE
6021 EPILOGUE_4_ARGS
6022ENDPROC iemAImpl_cvttss2si_i64_r32
6023
6024
6025;;
6026; cvtss2si instruction - 32-bit variant.
6027;
6028; @param A0 FPU context (FXSTATE or XSAVEAREA).
6029; @param A1 Where to return the MXCSR value.
6030; @param A2 Pointer to the result operand (output).
6031; @param A3 Pointer to the second operand (input).
6032;
6033BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
6034 PROLOGUE_4_ARGS
6035 IEMIMPL_SSE_PROLOGUE
6036 SSE_LD_FXSTATE_MXCSR A0
6037
6038 cvtss2si T0_32, [A3]
6039 mov dword [A2], T0_32
6040
6041 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6042 IEMIMPL_SSE_EPILOGUE
6043 EPILOGUE_4_ARGS
6044ENDPROC iemAImpl_cvtss2si_i32_r32
6045
6046;;
6047; cvtss2si instruction - 64-bit variant.
6048;
6049; @param A0 FPU context (FXSTATE or XSAVEAREA).
6050; @param A1 Where to return the MXCSR value.
6051; @param A2 Pointer to the result operand (output).
6052; @param A3 Pointer to the second operand (input).
6053;
6054BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
6055 PROLOGUE_4_ARGS
6056 IEMIMPL_SSE_PROLOGUE
6057 SSE_LD_FXSTATE_MXCSR A0
6058
6059 cvtss2si T0, [A3]
6060 mov qword [A2], T0
6061
6062 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6063 IEMIMPL_SSE_EPILOGUE
6064 EPILOGUE_4_ARGS
6065ENDPROC iemAImpl_cvtss2si_i64_r32
6066
6067
6068;;
6069; cvtsi2ss instruction - 32-bit variant.
6070;
6071; @param A0 FPU context (FXSTATE or XSAVEAREA).
6072; @param A1 Where to return the MXCSR value.
6073; @param A2 Pointer to the result operand (output).
6074; @param A3 Pointer to the second operand (input).
6075;
6076BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
6077 PROLOGUE_4_ARGS
6078 IEMIMPL_SSE_PROLOGUE
6079 SSE_LD_FXSTATE_MXCSR A0
6080
6081 cvtsi2ss xmm0, dword [A3]
6082 movd dword [A2], xmm0
6083
6084 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6085 IEMIMPL_SSE_EPILOGUE
6086 EPILOGUE_4_ARGS
6087ENDPROC iemAImpl_cvtsi2ss_r32_i32
6088
6089;;
6090; cvtsi2ss instruction - 64-bit variant.
6091;
6092; @param A0 FPU context (FXSTATE or XSAVEAREA).
6093; @param A1 Where to return the MXCSR value.
6094; @param A2 Pointer to the result operand (output).
6095; @param A3 Pointer to the second operand (input).
6096;
6097BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
6098 PROLOGUE_4_ARGS
6099 IEMIMPL_SSE_PROLOGUE
6100 SSE_LD_FXSTATE_MXCSR A0
6101
6102 cvtsi2ss xmm0, qword [A3]
6103 movd dword [A2], xmm0
6104
6105 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6106 IEMIMPL_SSE_EPILOGUE
6107 EPILOGUE_4_ARGS
6108ENDPROC iemAImpl_cvtsi2ss_r32_i64
6109
6110
6111;;
6112; cvtsi2sd instruction - 32-bit variant.
6113;
6114; @param A0 FPU context (FXSTATE or XSAVEAREA).
6115; @param A1 Where to return the MXCSR value.
6116; @param A2 Pointer to the result operand (output).
6117; @param A3 Pointer to the second operand (input).
6118;
6119BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
6120 PROLOGUE_4_ARGS
6121 IEMIMPL_SSE_PROLOGUE
6122 SSE_LD_FXSTATE_MXCSR A0
6123
6124 cvtsi2sd xmm0, dword [A3]
6125 movq [A2], xmm0
6126
6127 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6128 IEMIMPL_SSE_EPILOGUE
6129 EPILOGUE_4_ARGS
6130ENDPROC iemAImpl_cvtsi2sd_r64_i32
6131
6132;;
6133; cvtsi2sd instruction - 64-bit variant.
6134;
6135; @param A0 FPU context (FXSTATE or XSAVEAREA).
6136; @param A1 Where to return the MXCSR value.
6137; @param A2 Pointer to the result operand (output).
6138; @param A3 Pointer to the second operand (input).
6139;
6140BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
6141 PROLOGUE_4_ARGS
6142 IEMIMPL_SSE_PROLOGUE
6143 SSE_LD_FXSTATE_MXCSR A0
6144
6145 cvtsi2sd xmm0, qword [A3]
6146 movq [A2], xmm0
6147
6148 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6149 IEMIMPL_SSE_EPILOGUE
6150 EPILOGUE_4_ARGS
6151ENDPROC iemAImpl_cvtsi2sd_r64_i64
6152
6153
6154;;
6155; Partially initializes the host SSE MXCSR register from the guest value so that
6156; the guest rounding mode (and the FZ/DAZ bits) take effect while all exceptions are masked.
6157;
6158; @uses 8 bytes of stack (4 stay allocated holding the saved host MXCSR until the matching restore macro), T0.
6159; @param 1 Expression giving the address of the MXCSR register of the guest.
6160;
6161%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
6162 sub xSP, 4
6163
6164 stmxcsr [xSP]
6165 mov T0_32, [%1]
6166 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
6167 or T0_32, X86_MXCSR_XCPT_MASK
6168 sub xSP, 4
6169 mov [xSP], T0_32
6170 ldmxcsr [xSP]
6171 add xSP, 4
6172%endmacro
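;
; Worked example for SSE_LD_FXSTATE_MXCSR_ONLY (a sketch assuming the usual x86.mac bit
; definitions: FZ = bit 15, RC = bits 14:13, exception masks = bits 12:7, DAZ = bit 6):
;   guest MXCSR                     = 0x2040  (round toward -infinity, DAZ set)
;   and with FZ | RC_MASK | DAZ     = 0x2040  (only rounding/FZ/DAZ survive)
;   or  with XCPT_MASK              = 0x3fc0  (all exceptions masked, no status flags)
; i.e. the instruction executes with the guest's rounding behaviour but can never raise
; a host SIMD exception, while the host's own MXCSR stays saved on the stack.
;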
6173
6174
6175;;
6176; Merges the newly raised exception status flags into the guest MXCSR value and restores the host's original MXCSR register.
6177;
6178; @uses 4 bytes of stack to read back the MXCSR status after the instruction, T0, T1.
6179; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6180;
6181; @note Restores the stack pointer, releasing the 4 bytes left allocated by SSE_LD_FXSTATE_MXCSR_ONLY and reloading the saved host MXCSR from them.
6182;
6183%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
6184 sub xSP, 4
6185 stmxcsr [xSP]
6186 mov T0_32, [xSP]
6187 add xSP, 4
6188 ; Merge the status bits into the original MXCSR value.
6189 mov T1_32, [%1]
6190 and T0_32, X86_MXCSR_XCPT_FLAGS
6191 or T0_32, T1_32
6192 mov [%1], T0_32
6193
6194 ldmxcsr [xSP]
6195 add xSP, 4
6196%endmacro
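;
; Worked example for SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE (same bit-layout assumptions
; as above, exception status flags = bits 5:0):
;   host MXCSR after the instruction = 0x3fe0  (PE, bit 5, was raised)
;   and with XCPT_FLAGS              = 0x0020
;   or  with the guest value 0x2040  = 0x2060  -> written back through %1
; Only newly raised status flags are merged in; flags already set in the guest value
; remain set.  The host MXCSR saved by SSE_LD_FXSTATE_MXCSR_ONLY is then reloaded and
; its 4 stack bytes are released.
;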
6197
6198
6199;
6200; UCOMISS (SSE)
6201;
6202; @param A0 Pointer to the MXCSR value (input/output).
6203; @param A1 Pointer to the EFLAGS value (input/output).
6204; @param A2 Pointer to the first source operand (aka readonly destination).
6205; @param A3 Pointer to the second source operand.
6206;
6207BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
6208 PROLOGUE_4_ARGS
6209 IEMIMPL_SSE_PROLOGUE
6210 SSE_LD_FXSTATE_MXCSR_ONLY A0
6211
6212 movdqu xmm0, [A2]
6213 movdqu xmm1, [A3]
6214 ucomiss xmm0, xmm1
6215 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6216
6217 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6218 IEMIMPL_SSE_EPILOGUE
6219 EPILOGUE_4_ARGS
6220ENDPROC iemAImpl_ucomiss_u128
6221
6222BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
6223 PROLOGUE_4_ARGS
6224 IEMIMPL_SSE_PROLOGUE
6225 SSE_LD_FXSTATE_MXCSR_ONLY A0
6226
6227 movdqu xmm0, [A2]
6228 movdqu xmm1, [A3]
6229 vucomiss xmm0, xmm1
6230 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6231
6232 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6233 IEMIMPL_SSE_EPILOGUE
6234 EPILOGUE_4_ARGS
6235ENDPROC iemAImpl_vucomiss_u128
6236
6237
6238;
6239; UCOMISD (SSE)
6240;
6241; @param A0 Pointer to the MXCSR value (input/output).
6242; @param A1 Pointer to the EFLAGS value (input/output).
6243; @param A2 Pointer to the first source operand (aka readonly destination).
6244; @param A3 Pointer to the second source operand.
6245;
6246BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
6247 PROLOGUE_4_ARGS
6248 IEMIMPL_SSE_PROLOGUE
6249 SSE_LD_FXSTATE_MXCSR_ONLY A0
6250
6251 movdqu xmm0, [A2]
6252 movdqu xmm1, [A3]
6253 ucomisd xmm0, xmm1
6254 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6255
6256 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6257 IEMIMPL_SSE_EPILOGUE
6258 EPILOGUE_4_ARGS
6259ENDPROC iemAImpl_ucomisd_u128
6260
6261BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
6262 PROLOGUE_4_ARGS
6263 IEMIMPL_SSE_PROLOGUE
6264 SSE_LD_FXSTATE_MXCSR_ONLY A0
6265
6266 movdqu xmm0, [A2]
6267 movdqu xmm1, [A3]
6268 vucomisd xmm0, xmm1
6269 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6270
6271 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6272 IEMIMPL_SSE_EPILOGUE
6273 EPILOGUE_4_ARGS
6274ENDPROC iemAImpl_vucomisd_u128
6275
6276;
6277; COMISS (SSE)
6278;
6279; @param A0 Pointer to the MXCSR value (input/output).
6280; @param A1 Pointer to the EFLAGS value (input/output).
6281; @param A2 Pointer to the first source operand (aka readonly destination).
6282; @param A3 Pointer to the second source operand.
6283;
6284BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
6285 PROLOGUE_4_ARGS
6286 IEMIMPL_SSE_PROLOGUE
6287 SSE_LD_FXSTATE_MXCSR_ONLY A0
6288
6289 movdqu xmm0, [A2]
6290 movdqu xmm1, [A3]
6291 comiss xmm0, xmm1
6292 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6293
6294 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6295 IEMIMPL_SSE_EPILOGUE
6296 EPILOGUE_4_ARGS
6297ENDPROC iemAImpl_comiss_u128
6298
6299BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
6300 PROLOGUE_4_ARGS
6301 IEMIMPL_SSE_PROLOGUE
6302 SSE_LD_FXSTATE_MXCSR_ONLY A0
6303
6304 movdqu xmm0, [A2]
6305 movdqu xmm1, [A3]
6306 vcomiss xmm0, xmm1
6307 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6308
6309 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6310 IEMIMPL_SSE_EPILOGUE
6311 EPILOGUE_4_ARGS
6312ENDPROC iemAImpl_vcomiss_u128
6313
6314
6315;
6316; COMISD (SSE)
6317;
6318; @param A0 Pointer to the MXCSR value (input/output).
6319; @param A1 Pointer to the EFLAGS value (input/output).
6320; @param A2 Pointer to the first source operand (aka readonly destination).
6321; @param A3 Pointer to the second source operand.
6322;
6323BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
6324 PROLOGUE_4_ARGS
6325 IEMIMPL_SSE_PROLOGUE
6326 SSE_LD_FXSTATE_MXCSR_ONLY A0
6327
6328 movdqu xmm0, [A2]
6329 movdqu xmm1, [A3]
6330 comisd xmm0, xmm1
6331 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6332
6333 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6334 IEMIMPL_SSE_EPILOGUE
6335 EPILOGUE_4_ARGS
6336ENDPROC iemAImpl_comisd_u128
6337
6338BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
6339 PROLOGUE_4_ARGS
6340 IEMIMPL_SSE_PROLOGUE
6341 SSE_LD_FXSTATE_MXCSR_ONLY A0
6342
6343 movdqu xmm0, [A2]
6344 movdqu xmm1, [A3]
6345 vcomisd xmm0, xmm1
6346 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6347
6348 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6349 IEMIMPL_SSE_EPILOGUE
6350 EPILOGUE_4_ARGS
6351ENDPROC iemAImpl_vcomisd_u128
6352
6353
6354;;
6355; @todo Move this somewhere more suitable as well?
6356;
6357struc IEMMEDIAF2XMMSRC
6358 .uSrc1 resd 4
6359 .uSrc2 resd 4
6360endstruc
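;
; Note: .uSrc1 is at offset 0 and .uSrc2 at offset 16 (each resd 4 = 16 bytes), so the
; caller lays the two 128-bit source operands out back to back; this is assumed to
; mirror the IEMMEDIAF2XMMSRC type used on the C side.
;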
6361
6362
6363;
6364; CMPPS (SSE)
6365;
6366; @param A0 Pointer to the MXCSR value (input/output).
6367; @param A1 Pointer to the first media register size operand (output).
6368; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6369; @param A3 The 8-bit immediate (input).
6370;
6371BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
6372 PROLOGUE_4_ARGS
6373 IEMIMPL_SSE_PROLOGUE
6374 SSE_LD_FXSTATE_MXCSR_ONLY A0
6375
6376 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6377 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6378 lea T1, [.imm0 xWrtRIP]
6379 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6380 lea T0, [A3 + A3*8] ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
6381 %else
6382 lea T0, [A3 + A3*4] ; sizeof(cmpps+ret) == 5: A3 * 5
6383 %endif
6384 lea T1, [T1 + T0]
6385 IBT_NOTRACK
6386 call T1
6387 movdqu [A1], xmm0
6388
6389 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6390 IEMIMPL_SSE_EPILOGUE
6391 EPILOGUE_4_ARGS
6392 %assign bImm 0
6393 %rep 256
6394.imm %+ bImm:
6395 IBT_ENDBRxx_WITHOUT_NOTRACK
6396 cmpps xmm0, xmm1, bImm
6397 ret
6398 %assign bImm bImm + 1
6399 %endrep
6400.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
6401ENDPROC iemAImpl_cmpps_u128
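;
; How the immediate dispatch above works (sketch): the imm8 must be encoded in the
; instruction itself, so 256 small "cmpps xmm0, xmm1, <n> / ret" stubs are emitted and
; the one matching the guest immediate is called.  Each stub is 5 bytes (9 with an
; endbr64 prepended), so the target address is simply .imm0 + A3 * 5 (or A3 * 9).
; For example, imm8 = 2 (CMPLEPS) lands on .imm2 at .imm0 + 10 in the non-IBT case.
;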
6402
6403;;
6404; SSE instructions with 8-bit immediates of the form
6405; xxx xmm1, xmm2, imm8.
6406; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
6407; register.
6408;
6409; @param 1 The instruction name.
6410;
6411; @param A0 Pointer to the MXCSR value (input/output).
6412; @param A1 Pointer to the first media register size operand (output).
6413; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6414; @param A3 The 8-bit immediate (input).
6415;
6416%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
6417BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6418 PROLOGUE_4_ARGS
6419 IEMIMPL_SSE_PROLOGUE
6420 SSE_LD_FXSTATE_MXCSR_ONLY A0
6421
6422 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6423 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6424 lea T1, [.imm0 xWrtRIP]
6425 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6426 lea T0, [A3 + A3*4] ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
6427 %else
6428 lea T0, [A3 + A3*2] ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
6429 %endif
6430 lea T1, [T1 + T0*2]
6431 IBT_NOTRACK
6432 call T1
6433 movdqu [A1], xmm0
6434
6435 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6436 IEMIMPL_SSE_EPILOGUE
6437 EPILOGUE_4_ARGS
6438 %assign bImm 0
6439 %rep 256
6440.imm %+ bImm:
6441 IBT_ENDBRxx_WITHOUT_NOTRACK
6442 %1 xmm0, xmm1, bImm
6443 ret
6444 %assign bImm bImm + 1
6445 %endrep
6446.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6447ENDPROC iemAImpl_ %+ %1 %+ _u128
6448%endmacro
6449
6450IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
6451IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
6452IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6453
6454;;
6455; SSE instructions with 8-bit immediates of the form
6456; xxx xmm1, xmm2, imm8.
6457; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6458; register.
6459;
6460; @param 1 The instruction name.
6461;
6462; @param A0 Pointer to the MXCSR value (input/output).
6463; @param A1 Pointer to the first media register size operand (output).
6464; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6465; @param A3 The 8-bit immediate (input).
6466;
6467%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
6468BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6469 PROLOGUE_4_ARGS
6470 IEMIMPL_SSE_PROLOGUE
6471 SSE_LD_FXSTATE_MXCSR_ONLY A0
6472
6473 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6474 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6475 lea T1, [.imm0 xWrtRIP]
6476 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6477 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
6478 lea T1, [T1 + T0*4]
6479 %else
6480 lea T1, [T1 + A3*8] ; sizeof(insn+ret+int3) == 8: A3 * 8
6481 %endif
6482 IBT_NOTRACK
6483 call T1
6484 movdqu [A1], xmm0
6485
6486 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6487 IEMIMPL_SSE_EPILOGUE
6488 EPILOGUE_4_ARGS
6489 %assign bImm 0
6490 %rep 256
6491.imm %+ bImm:
6492 IBT_ENDBRxx_WITHOUT_NOTRACK
6493 %1 xmm0, xmm1, bImm
6494 ret
6495 int3
6496 %assign bImm bImm + 1
6497 %endrep
6498.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
6499ENDPROC iemAImpl_ %+ %1 %+ _u128
6500%endmacro
6501
6502IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
6503IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
6504IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
6505IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6506IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6507IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
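;
; Note on the 6-byte variant above: insn (6) + ret (1) is only 7 bytes, so an int3 pads
; each stub to 8 bytes.  That keeps the stub size a power of two (a single "A3 * 8"
; scaled address) and makes any stray jump into the padding trap; the resulting table
; size is verified against 0x800 (256 * 8) by IEMCHECK_256_JUMP_ARRAY_SIZE.
;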
6508
6509
6510;;
6511; SSE instructions of the form
6512; xxx mm, xmm
6513; where we need to load and save the MXCSR register.
6514;
6515; @param 1 The instruction name.
6516;
6517; @param A0 Pointer to the MXCSR value (input/output).
6518; @param A1 Pointer to the first MMX register sized operand (output).
6519; @param A2 Pointer to the media register sized operand (input).
6520;
6521%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
6522BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6523 PROLOGUE_3_ARGS
6524 IEMIMPL_SSE_PROLOGUE
6525 SSE_LD_FXSTATE_MXCSR_ONLY A0
6526
6527 movdqu xmm0, [A2]
6528 %1 mm0, xmm0
6529 movq [A1], mm0
6530
6531 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6532 IEMIMPL_SSE_EPILOGUE
6533 EPILOGUE_3_ARGS
6534ENDPROC iemAImpl_ %+ %1 %+ _u128
6535%endmacro
6536
6537IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
6538IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6539
6540;;
6541; SSE instructions of the form
6542; xxx xmm, mm/m64
6543; where we need to load and save the MXCSR register.
6544;
6545; @param 1 The instruction name.
6546;
6547; @param A0 Pointer to the MXCSR value (input/output).
6548; @param A1 Pointer to the first media register sized operand (input/output).
6549; @param A2 The 64-bit source value from an MMX media register (input).
6550;
6551%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
6552BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6553 PROLOGUE_3_ARGS
6554 IEMIMPL_SSE_PROLOGUE
6555 SSE_LD_FXSTATE_MXCSR_ONLY A0
6556
6557 movdqu xmm0, [A1]
6558 movq mm0, A2
6559 %1 xmm0, mm0
6560 movdqu [A1], xmm0
6561
6562 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6563 IEMIMPL_SSE_EPILOGUE
6564 EPILOGUE_3_ARGS
6565ENDPROC iemAImpl_ %+ %1 %+ _u128
6566%endmacro
6567
6568IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
6569IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6570
6571;;
6572; SSE instructions of the form
6573; xxx mm, xmm/m64
6574; where we need to load and save the MXCSR register.
6575;
6576; @param 1 The instruction name.
6577;
6578; @param A0 Pointer to the MXCSR value (input/output).
6579; @param A1 Pointer to the first MMX media register sized operand (output).
6580; @param A2 The 64-bit source value (input).
6581;
6582%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
6583BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6584 PROLOGUE_3_ARGS
6585 IEMIMPL_SSE_PROLOGUE
6586 SSE_LD_FXSTATE_MXCSR_ONLY A0
6587
6588 movq xmm0, A2
6589 %1 mm0, xmm0
6590 movq [A1], mm0
6591
6592 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6593 IEMIMPL_SSE_EPILOGUE
6594 EPILOGUE_3_ARGS
6595ENDPROC iemAImpl_ %+ %1 %+ _u128
6596%endmacro
6597
6598IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
6599IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6600
6601;
6602; All forms of RDRAND and RDSEED
6603;
; @param 1 The instruction name (rdrand or rdseed).
; @param 2 The register to use with the native instruction (ax, eax or rax).
; @param 3 The operand size in bits (16, 32 or 64).
;
6604; @param A0 Pointer to the destination operand (output).
6605; @param A1 Pointer to the EFLAGS value (input/output).
6606;
6607%macro IEMIMPL_RDRAND_RDSEED 3
6608BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
6609 PROLOGUE_2_ARGS
6610
6611 %1 %2
6612 mov [A0], %2
6613 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6614
6615 EPILOGUE_2_ARGS
6616ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
6617%endmacro
6618
6619IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
6620IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
6621IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
6622IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
6623IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
6624IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
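;
; Note: rdrand/rdseed report availability in CF (CF=1: valid value; CF=0: no entropy,
; destination zeroed) and clear OF/SF/ZF/AF/PF, which is why the helper hands the
; EFLAGS status bits back through A1 alongside the value.
;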
6625
6626
6627;;
6628; sha1rnds4 xmm1, xmm2, imm8.
6629;
6632; @param A0 Pointer to the first media register size operand (input/output).
6633; @param A1 Pointer to the second source media register size operand (input).
6634; @param A2 The 8-bit immediate (input).
6635;
6636BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
6637 PROLOGUE_3_ARGS
6638 IEMIMPL_SSE_PROLOGUE
6639
6640 movdqu xmm0, [A0]
6641 movdqu xmm1, [A1]
6642 lea T1, [.imm0 xWrtRIP]
6643 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6644 lea T0, [A2 + A2*4] ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
6645 %else
6646 lea T0, [A2 + A2*2] ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
6647 %endif
6648 lea T1, [T1 + T0*2]
6649 IBT_NOTRACK
6650 call T1
6651 movdqu [A0], xmm0
6652
6653 IEMIMPL_SSE_EPILOGUE
6654 EPILOGUE_3_ARGS
6655 %assign bImm 0
6656 %rep 256
6657.imm %+ bImm:
6658 IBT_ENDBRxx_WITHOUT_NOTRACK
6659 sha1rnds4 xmm0, xmm1, bImm
6660 ret
6661 %assign bImm bImm + 1
6662 %endrep
6663.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6664ENDPROC iemAImpl_sha1rnds4_u128
6665
6666
6667;;
6668; sha256rnds2 xmm1, xmm2, <XMM0>.
6669;
6672; @param A0 Pointer to the first media register size operand (input/output).
6673; @param A1 Pointer to the second source media register size operand (input).
6674; @param A2 Pointer to the implicit XMM0 constants (input).
6675;
6676BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
6677 PROLOGUE_3_ARGS
6678 IEMIMPL_SSE_PROLOGUE
6679
6680 movdqu xmm0, [A2]
6681 movdqu xmm1, [A0]
6682 movdqu xmm2, [A1]
6683 sha256rnds2 xmm1, xmm2
6684 movdqu [A0], xmm1
6685
6686 IEMIMPL_SSE_EPILOGUE
6687 EPILOGUE_3_ARGS
6688ENDPROC iemAImpl_sha256rnds2_u128
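;
; Note: sha256rnds2 takes its third operand implicitly from XMM0, so the helper above
; materialises the guest's XMM0 content (passed via A2) in the host xmm0 register
; before executing the instruction.
;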
6689
6690
6691;
6692; 32-bit forms of ADCX and ADOX
6693;
; @param 1 The instruction name (adcx or adox).
; @param 2 The EFLAGS bit the instruction consumes and produces (X86_EFL_CF or X86_EFL_OF).
;
6694; @param A0 Pointer to the destination operand (input/output).
6695; @param A1 Pointer to the EFLAGS value (input/output).
6696; @param A2 The 32-bit source operand (input).
6697;
6698%macro IEMIMPL_ADX_32 2
6699BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
6700 PROLOGUE_4_ARGS
6701
6702 IEM_LOAD_FLAGS A1, %2, 0
6703 %1 A2_32, [A0]
6704 mov [A0], A2_32
6705 IEM_SAVE_FLAGS A1, %2, 0
6706
6707 EPILOGUE_4_ARGS
6708ENDPROC iemAImpl_ %+ %1 %+ _u32
6709%endmacro
6710
6711;
6712; 64-bit forms of ADCX and ADOX
6713;
; @param 1 The instruction name (adcx or adox).
; @param 2 The EFLAGS bit the instruction consumes and produces (X86_EFL_CF or X86_EFL_OF).
;
6714; @param A0 Pointer to the destination operand (input/output).
6715; @param A1 Pointer to the EFLAGS value (input/output).
6716; @param A2 The 64-bit source operand (input).
6717;
6718%macro IEMIMPL_ADX_64 2
6719BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
6720 PROLOGUE_4_ARGS
6721
6722 IEM_LOAD_FLAGS A1, %2, 0
6723 %1 A2, [A0]
6724 mov [A0], A2
6725 IEM_SAVE_FLAGS A1, %2, 0
6726
6727 EPILOGUE_4_ARGS
6728ENDPROC iemAImpl_ %+ %1 %+ _u64
6729%endmacro
6730
6731IEMIMPL_ADX_32 adcx, X86_EFL_CF
6732IEMIMPL_ADX_64 adcx, X86_EFL_CF
6733
6734IEMIMPL_ADX_32 adox, X86_EFL_OF
6735IEMIMPL_ADX_64 adox, X86_EFL_OF
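;
; Note: adcx only consumes/produces CF and adox only consumes/produces OF, so, unlike
; adc, two independent carry chains can be kept in flight.  A minimal sketch of how
; guest code typically uses them (illustration only, not part of these helpers):
;       xor     eax, eax        - clears both CF and OF
;       adcx    r8, r10         - carry chain 1 via CF
;       adox    r9, r11         - carry chain 2 via OF
; This is why the helpers above load and save only the single relevant flag
; (X86_EFL_CF or X86_EFL_OF) around the instruction.
;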