VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 100595

Last change on this file since 100595 was 100595, checked in by vboxsync, 19 months ago

VMM/IEM: Implement vpsubsb/vpsubsw instruction emulations, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 187.4 KB
Line 
1; $Id: IEMAllAImpl.asm 100595 2023-07-17 10:55:34Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; RET XX / RET wrapper for fastcall.
;
; On x86 Windows the fastcall convention is callee-cleans-stack, so this
; expands to 'ret %1' there; everywhere else (AMD64, non-Windows x86) a
; plain 'ret' is emitted and %1 is ignored.
;
; @param 1      The number of stack argument bytes to pop (x86 Windows only).
;
%macro RET_FASTCALL 1
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
        ret     %1
 %else
        ret
 %endif
%else
        ret
%endif
%endmacro
55
;;
; NAME for fastcall functions.
;
; On x86 Windows the symbol is decorated as '<prefix>name@cbArgs' (fastcall
; mangling); on every other target the plain NAME() mangling is used and the
; a_cbArgs/a_Prefix arguments are ignored.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
70
;;
; BEGINPROC for fastcall functions.
;
; Exports/declares the (possibly fastcall-decorated) symbol for the output
; format in use, defines the label, and emits an IBT end-branch landing pad
; so the function may be reached via indirect calls with CET/IBT enabled.
;
; @param 1      The function name (C).
; @param 2      The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
        export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
        export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):                 ; label definitions need no dollar escaping.
        IBT_ENDBRxx
%endmacro
92
93
;
; We employ some macro assembly here to hide the calling convention differences.
;
; A0..A3 map the first four integer arguments onto the host calling
; convention; T0..T2 are scratch registers guaranteed not to alias the
; argument registers (T2 only exists on AMD64).
;
%ifdef RT_ARCH_AMD64
 ; AMD64: all four arguments arrive in registers, so the prologues are empty
 ; and the epilogues are a plain return (no stack-argument bytes to pop).
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret                             ; The EX byte count only matters on x86.
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64 ABI: integer arguments in rdi, rsi, rdx, rcx.
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64 ABI: integer arguments in rcx, rdx, r8, r9.
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif

 ; Scratch registers (volatile in both AMD64 conventions).
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86: fastcall-style - A0/A1 in ecx/edx, remaining arguments on the stack.
 ; The prologues save the callee-saved registers that A2/A3/T1 are mapped to
 ; (ebx/esi/edi) and load the stack arguments; the epilogues restore them and
 ; pop the stack-argument bytes via 'ret imm16'.
 %macro PROLOGUE_1_ARGS 0
        push    edi                     ; T1 = edi is callee-saved.
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi                     ; T1 = edi is callee-saved.
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx                     ; A2 = ebx is callee-saved.
        mov     ebx, [esp + 4 + 4]      ; Load A2: skip return address + saved ebx.
        push    edi                     ; T1 = edi is callee-saved.
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx                     ; A2 = ebx is callee-saved.
        push    edi                     ; T1 = edi is callee-saved.
        push    esi                     ; A3 = esi is callee-saved.
        mov     ebx, [esp + 12 + 4 + 0] ; Load A2: skip 3 saved regs + return address.
        mov     esi, [esp + 12 + 4 + 4] ; Load A3.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
282
283
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Merges the guest flag bits selected by (%2 | %3) into the host EFLAGS so
; the emulated instruction starts from the guest's flag state.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1      The parameter (A0..A3) pointing to the eflags.
; @param 2      The set of modified flags.
; @param 3      The set of undefined flags.
;
; @note The '%if (%3) != 0' guard is currently commented out, so the guest
;       flags are always loaded, not just when undefined flags exist.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 ;%endif
%endmacro
303
;;
; Load the relevant flags from [%1].
;
; Same body as IEM_MAYBE_LOAD_FLAGS, but unconditional by design (no
; commented-out guard).
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1      The parameter (A0..A3) pointing to the eflags.
; @param 2      The set of flags to load.
; @param 3      The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
%endmacro
321
;;
; Update the flag.
;
; Copies the host EFLAGS bits selected by (%2 | %3) into the guest EFLAGS at
; [%1], leaving all other guest flag bits untouched.  Emits nothing when both
; masks are zero.
;
; @remarks Clobbers T0, T1, stack.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                      ; T1 = host EFLAGS after the instruction.
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
341
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; Like IEM_SAVE_FLAGS, but additionally forces the %3 bits clear and the %4
; bits set in the saved guest EFLAGS.
;
; @remarks Clobbers T0, T1, stack.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      Mask of additional flags to always clear
; @param 4      Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host EFLAGS after the instruction.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
365
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0.  So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS.  Also clobbers T2 on AMD64.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      Mask of additional flags to always clear
; @param 4      The result register to set SF by.
; @param 5      The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6      The (full) register containing the parity table index. Will be modified!

%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf                           ; Grab the host flags into T2...
        pop     T2
 %else
        push    T0                      ; ...or into T0 on x86, preserving its old value.
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; restore T0/xAX.
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; result sign bit -> CF.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; index into the 256-entry parity table.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
419
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Pure guest-flag adjustment - the host EFLAGS are not consulted.
;
; @remarks Clobbers T0.
; @param 1      The register pointing to the EFLAGS.
; @param 2      Mask of additional flags to always clear
; @param 3      Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
440
;;
; Calculates the new EFLAGS using fixed clear and set bit masks, and sets PF
; according to the value in %4.
;
; @remarks Clobbers T0, %4, EFLAGS.  Also clobbers T2 on AMD64 (parity table address).
; @param 1      The register pointing to the EFLAGS.
; @param 2      Mask of additional flags to always clear
; @param 3      Mask of additional flags to always set.
; @param 4      The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; Index into the 256-entry parity table.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
465
466
;;
; Checks that the size expression %1 matches %2 adjusted according to
; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
;
; The emitted 'dw' operands deliberately overflow when %1 deviates from the
; expected size, turning a jump-table size mismatch into an assembly-time
; warning.
;
; @param 1      The jump array size assembly expression.
; @param 2      The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
;
%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        dw      (0xffff - %2 - 256*4) + %1 ; will cause warning if entries are too big.
        dw      (0xffff + %2 + 256*4) - %1 ; will cause warning if entries are too small.
 %else
        dw      (0xffff - %2) + %1      ; will cause warning if entries are too big.
        dw      (0xffff + %2) - %1      ; will cause warning if entries are too small.
 %endif
%endmacro
482
483
484;*********************************************************************************************************************************
485;* External Symbols *
486;*********************************************************************************************************************************
487extern NAME(g_afParity)
488
489
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      Non-zero if there should be a locked version.
; @param 3      The modified flags.
; @param 4      The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8         ; Perform the operation on guest memory with guest flags loaded.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8            ; The EX byte count only matters on x86.
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; instr,lock, modified-flags, undefined flags
IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
589
590
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32     ; 3-operand VEX form: T0 = op(A1, A2).
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
631
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
; @note (review) Only three arguments are declared (size 12), yet the bodies
;       use the 4-args prologue/epilogue; on a 32-bit host the 4-args epilogue
;       pops 8 stack-argument bytes - confirm this is intentional.
; @note (review) The preload of [A0] into T0 looks redundant, as the BLS*
;       instructions fully overwrite the destination operand - confirm.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
673
674
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand in A2.  These instructions leave EFLAGS alone, so no eflags pointer.
;
; @param 1      The instruction mnemonic.
; @param 2      Fallback instruction if applicable.
; @param 3      Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; GCC: ECX (=A3) is free, count goes straight into CL.
        %2      A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; MSC/x86: A0 lives in ECX - swap so the count is in CL
        %2      A1_32, cl               ; and the destination pointer in A2.
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; Store the full 64-bit result (was a truncating A1_32 store).
 %else
        xchg    A2, A0
        %2      A1, cl
        mov     [A2], A1                ; Full 64-bit store via the swapped destination pointer.
 %endif
        ; Note! The old trailing 'mov [A0], A1' was dropped: after the MSC-path
        ;       xchg, A0 no longer holds the destination pointer but the shift
        ;       count, so that store wrote through an arbitrary address.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
 %endif
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
746
747
;
; RORX uses an immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; GCC: ECX (=A3) is free, count goes into CL.
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; MSC/x86: A0 is in ECX - swap so the count is in CL
        ror     A1_32, cl               ; and the destination pointer in A2.
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1
 %else
        xchg    A2, A0
        ror     A1, cl
        mov     [A2], A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
781
782
;
; MULX - unsigned multiply without touching flags; two destination pointers
; (A0 = high part, A1 = low part), multiplier in A3, implicit factor in xDX.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect, MULX takes the implicit factor from EDX.
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32


BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32                   ; EDX:EAX = EAX * A2_32.
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
822
%ifdef RT_ARCH_AMD64
; 64-bit MULX variants - AMD64 hosts only.
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect, MULX takes the implicit factor from RDX.
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2                      ; RDX:RAX = RAX * A2.
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
862
863
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.  (No 8-bit forms: BT/BTC/BTS/BTR have none.)
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      Non-zero if there should be a locked version.
; @param 3      The modified flags.
; @param 4      The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8            ; The EX byte count only matters on x86.
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
940
941;;
942; Macro for implementing a bit search operator.
943;
944; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
945; system where the 64-bit accesses requires hand coding.
946;
947; All the functions takes a pointer to the destination memory operand in A0,
948; the source register operand in A1 and a pointer to eflags in A2.
949;
950; In the ZF case the destination register is 'undefined', however it seems that
951; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
952; AMD and Intel and accoridng to https://www.sandpile.org/x86/flags.htm between
953; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
954; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
955;
956; @param 1 The instruction mnemonic.
957; @param 2 The modified flags.
958; @param 3 The undefined flags.
959; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
960;
961%macro IEMIMPL_BIT_OP2 4
962BEGINCODE
963BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
964 PROLOGUE_3_ARGS
965 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
966 %1 T0_16, A1_16
967%if %4 != 0
968 jz .unchanged_dst
969%endif
970 mov [A0], T0_16
971.unchanged_dst:
972 IEM_SAVE_FLAGS A2, %2, %3
973 EPILOGUE_3_ARGS
974ENDPROC iemAImpl_ %+ %1 %+ _u16
975
976BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
977 PROLOGUE_3_ARGS
978 %1 T1_16, A1_16
979%if %4 != 0
980 jz .unchanged_dst
981%endif
982 mov [A0], T1_16
983 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
984 EPILOGUE_3_ARGS
985.unchanged_dst:
986 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
987 EPILOGUE_3_ARGS
988ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
989
990BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
991 PROLOGUE_3_ARGS
992 %1 T0_16, A1_16
993%if %4 != 0
994 jz .unchanged_dst
995%endif
996 mov [A0], T0_16
997.unchanged_dst:
998 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
999 EPILOGUE_3_ARGS
1000ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1001
1002
1003BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1004 PROLOGUE_3_ARGS
1005 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1006 %1 T0_32, A1_32
1007%if %4 != 0
1008 jz .unchanged_dst
1009%endif
1010 mov [A0], T0_32
1011.unchanged_dst:
1012 IEM_SAVE_FLAGS A2, %2, %3
1013 EPILOGUE_3_ARGS
1014ENDPROC iemAImpl_ %+ %1 %+ _u32
1015
1016BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1017 PROLOGUE_3_ARGS
1018 %1 T1_32, A1_32
1019%if %4 != 0
1020 jz .unchanged_dst
1021%endif
1022 mov [A0], T1_32
1023 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1024 EPILOGUE_3_ARGS
1025.unchanged_dst:
1026 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1027 EPILOGUE_3_ARGS
1028ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1029
1030BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1031 PROLOGUE_3_ARGS
1032 %1 T0_32, A1_32
1033%if %4 != 0
1034 jz .unchanged_dst
1035%endif
1036 mov [A0], T0_32
1037.unchanged_dst:
1038 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1039 EPILOGUE_3_ARGS
1040ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1041
1042
1043 %ifdef RT_ARCH_AMD64
1044
1045BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1046 PROLOGUE_3_ARGS
1047 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1048 %1 T0, A1
1049%if %4 != 0
1050 jz .unchanged_dst
1051%endif
1052 mov [A0], T0
1053.unchanged_dst:
1054 IEM_SAVE_FLAGS A2, %2, %3
1055 EPILOGUE_3_ARGS_EX 8
1056ENDPROC iemAImpl_ %+ %1 %+ _u64
1057
1058BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1059 PROLOGUE_3_ARGS
1060 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1061 %1 T1, A1
1062%if %4 != 0
1063 jz .unchanged_dst
1064%endif
1065 mov [A0], T1
1066 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1067 EPILOGUE_3_ARGS
1068.unchanged_dst:
1069 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1070 EPILOGUE_3_ARGS
1071ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1072
1073BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1074 PROLOGUE_3_ARGS
1075 %1 T0, A1
1076%if %4 != 0
1077 jz .unchanged_dst
1078%endif
1079 mov [A0], T0
1080.unchanged_dst:
1081 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1082 EPILOGUE_3_ARGS_EX 8
1083ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1084
1085 %endif ; RT_ARCH_AMD64
1086%endmacro
1087
; Instantiate the bit-scan/trailing-leading-zero-count workers.
; Arguments: mnemonic, modified EFLAGS, undefined EFLAGS, and whether the
; destination is left unwritten when the instruction reports "no result"
; (the jz .unchanged_dst path in the macro body): 1 for bsf/bsr, which leave
; the destination untouched on a zero source; 0 for tzcnt/lzcnt, which
; always produce a result.
IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1092
1093
1094;;
1095; Macro for implementing POPCNT.
1096;
1097; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1098; system where the 64-bit accesses requires hand coding.
1099;
1100; All the functions takes a pointer to the destination memory operand in A0,
1101; the source register operand in A1 and a pointer to eflags in A2.
1102;
1103; ASSUMES Intel and AMD set EFLAGS the same way.
1104;
1105; ASSUMES the instruction does not support memory destination.
1106;
1107; @param 1 The instruction mnemonic.
1108; @param 2 The modified flags.
1109; @param 3 The undefined flags.
1110;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
; 16-bit: *pu16Dst(A0) = %1 of u16Src(A1); EFLAGS at A2 updated per %2/%3.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A2, %2, %3
        %1      T0_16, A1_16
        mov     [A0], T0_16
        IEM_SAVE_FLAGS                 A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; 32-bit: *pu32Dst(A0) = %1 of u32Src(A1); EFLAGS at A2 updated per %2/%3.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS                 A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
; 64-bit variant - only assembled on 64-bit hosts; the 32-bit host
; version requires hand coding elsewhere (see the macro header comment).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS                 A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
; POPCNT: all six status flags are modified (most cleared, ZF set on zero
; result per the instruction definition), none undefined.
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1143
1144
1145;
1146; IMUL is also a similar but yet different case (no lock, no mem dst).
1147; The rDX:rAX variant of imul is handled together with mul further down.
1148;
1149BEGINCODE
1150; @param 1 EFLAGS that are modified.
1151; @param 2 Undefined EFLAGS.
1152; @param 3 Function suffix.
; @param 4 EFLAGS variation: 1 for intel (adjusts AF/ZF and recalculates
; SF & PF from the result via IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF);
; 0 for native and 2 for AMD (EFLAGS saved as-is via IEM_SAVE_FLAGS).
%macro IEMIMPL_IMUL_TWO 4
; 16-bit two-operand IMUL: *pu16Dst(A0) *= u16Src(A1); EFLAGS at A2.
; Note the destination value lives in memory at A0 and the register
; operand arrives by value in A1.
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A2, %1, %2
        imul    A1_16, word [A0]
        mov     [A0], A1_16
 %if %4 != 1
        IEM_SAVE_FLAGS                 A2, %1, %2
 %else
        ; Intel variation: adjust AF/ZF and derive SF & PF from the result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

; 32-bit two-operand IMUL: *pu32Dst(A0) *= u32Src(A1); EFLAGS at A2.
BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS                 A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
; 64-bit two-operand IMUL - 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS                 A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
; Instantiate native, Intel and AMD EFLAGS variants (see %4 above).
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1200
1201
1202;
1203; XCHG for memory operands. This implies locking. No flag changes.
1204;
1205; Each function takes two arguments, first the pointer to the memory,
1206; then the pointer to the register. They all return void.
1207;
1208BEGINCODE
; Locked exchange: A0 = pointer to the memory operand, A1 = pointer to the
; register value.  XCHG with a memory operand asserts the LOCK signal
; implicitly, so no explicit lock prefix is required.  No flags changed.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value
        xchg    [A0], T0_8              ; atomically swap with memory
        mov     [A1], T0_8              ; write old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
; 64-bit variant - 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1242
1243; Unlocked variants for fDisregardLock mode.
1244
; Non-atomic exchange (fDisregardLock): plain loads and stores are used
; instead of the implicitly locked XCHG instruction.  A0 = pointer to the
; memory operand, A1 = pointer to the register value.  No flags changed.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A0]              ; T1 = old memory value
        mov     T0_8, [A1]              ; T0 = old register value
        mov     [A1], T1_8              ; register <- old memory
        mov     [A0], T0_8              ; memory   <- old register
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A0]
        mov     T0_16, [A1]
        mov     [A1], T1_16
        mov     [A0], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A0]
        mov     T0_32, [A1]
        mov     [A1], T1_32
        mov     [A0], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
; 64-bit variant - 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A0]
        mov     T0, [A1]
        mov     [A1], T1
        mov     [A0], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1282
1283
1284;
1285; XADD for memory operands.
1286;
1287; Each function takes three arguments, first the pointer to the
1288; memory/register, then the pointer to the register, and finally a pointer to
1289; eflags. They all return void.
1290;
1291BEGINCODE
; Unlocked XADD: A0 = pointer to memory/destination, A1 = pointer to the
; register value, A2 = pointer to EFLAGS.  All six status flags are updated
; by the addition; the old destination value is returned via *A1.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = register operand
        xadd    [A0], T0_8              ; [A0] += T0; T0 = old [A0]
        mov     [A1], T0_8              ; return old destination value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
; 64-bit variant - 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1333
; Locked XADD variants - identical to the unlocked ones above except for
; the explicit lock prefix making the read-modify-write atomic.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8
        mov     [A1], T0_8              ; return old destination value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
; 64-bit variant - 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1375
1376
1377;
1378; CMPXCHG8B.
1379;
1380; These are tricky register wise, so the code is duplicated for each calling
1381; convention.
1382;
1383; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1384;
1385; C-proto:
1386; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1387; uint32_t *pEFlags));
1388;
1389; Note! Identical to iemAImpl_cmpxchg16b.
1390;
1391BEGINCODE
; See the C prototype above: A0/arg0 = pu64Dst, arg1 = pu64EaxEdx (in/out
; comparand), arg2 = pu64EbxEcx (replacement value), arg3 = pEFlags.
; cmpxchg8b needs EBX/ECX/EDX:EAX, hence the hand-written register
; shuffling per calling convention.  Only ZF is defined by cmpxchg8b.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved, needed for EBX operand

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; EBX:ECX = replacement value
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; EDX:EAX = comparand
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write back EDX:EAX (old value on miss)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee-saved, needed for EBX operand

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; EBX:ECX = replacement value
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; EDX:EAX = comparand (rsi = pu64EaxEdx)
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]            ; rdi = pu64Dst

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; 32-bit host: fastcall passes arg0/arg1 in ecx/edx, rest on stack.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX:ECX = replacement value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EDX:EAX = comparand
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1466
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        ; (The unlocked worker above already uses a lock prefix, so the
        ; locked entry point can simply tail-jump to it.)
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1471
1472%ifdef RT_ARCH_AMD64
1473
1474;
1475; CMPXCHG16B.
1476;
1477; These are tricky register wise, so the code is duplicated for each calling
1478; convention.
1479;
1480; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1481;
1482; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1484; uint32_t *pEFlags));
1485;
1486; Note! Identical to iemAImpl_cmpxchg8b.
1487;
1488BEGINCODE
; 128-bit compare-exchange; structure mirrors iemAImpl_cmpxchg8b above,
; using the full 64-bit registers and 8-byte offsets.  Only ZF is defined.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved, needed for RBX operand

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; RBX:RCX = replacement value
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; RDX:RAX = comparand
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write back RDX:RAX (old value on miss)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee-saved, needed for RBX operand

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; RBX:RCX = replacement value
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; RDX:RAX = comparand (rsi = pu64RaxRdx)
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]           ; rdi = pu128Dst

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1533
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        ; (The unlocked worker above already uses a lock prefix, so the
        ; locked entry point can simply tail-jump to it.)
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
1538
1539%endif ; RT_ARCH_AMD64
1540
1541
1542;
1543; CMPXCHG.
1544;
1545; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1546;
1547; C-proto:
1548; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1549;
1550BEGINCODE
; Generates the cmpxchg workers: arg0 = puXDst, arg1 = puEax (in/out
; comparand), arg2 = uReg (replacement value), arg3 = pEFlags.
; @param 1  Lock prefix ('lock' or empty).
; @param 2  Function name suffix ('_locked' or empty).
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; AL = comparand
        %1 cmpxchg [A0], A2_8
        mov     [A1], al                ; return AL (old dest value on miss)
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; AX = comparand
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; EAX = comparand
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; RAX = comparand
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX:ECX = replacement value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EDX:EAX = comparand
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; On success (ZF=1) the operands compared equal, so 'cmp eax, eax'
        ; reproduces the full CMPXCHG flag state (ZF=1, rest cleared).  On
        ; failure (ZF=0) we must redo the 64-bit comparison manually to get
        ; the remaining flags (and keep ZF=0).
        ; Fix: this used to be 'jz', which sent the success case through the
        ; manual compare and, worse, let the failure case fall into
        ; 'cmp eax, eax' - forcing ZF=1 and reporting a bogus success.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax              ; return EDX:EAX (actual value on miss)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ; Chained 32-bit compares approximating the 64-bit comparand-vs-dest
        ; compare: ZF is exact; SF/OF/CF are only exact when the high dwords
        ; differ.
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1639
1640;;
1641; Macro for implementing a unary operator.
1642;
1643; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1644; variants, except on 32-bit system where the 64-bit accesses requires hand
1645; coding.
1646;
1647; All the functions takes a pointer to the destination memory operand in A0,
1648; the source register operand in A1 and a pointer to eflags in A2.
1649;
1650; @param 1 The instruction mnemonic.
1651; @param 2 The modified flags.
1652; @param 3 The undefined flags.
1653;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
; 8-bit: apply %1 to the byte at [A0]; EFLAGS pointer in A1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

; Locked 8-bit variant (atomic read-modify-write).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
; 64-bit variants - 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; Note: inc/dec leave CF untouched, hence CF is absent from their masks.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1728
1729
1730;
1731; BSWAP. No flag changes.
1732;
1733; Each function takes one argument, pointer to the value to bswap
1734; (input/output). They all return void.
1735;
; 16-bit BSWAP: the 0x66 operand-size prefix is emitted by hand because
; assemblers won't encode a 16-bit bswap (its result is officially
; undefined); executing it natively reproduces the host CPU's behaviour.
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h
        bswap T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16

BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32

BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: byte-swap each half and store them crosswise.
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; low dword
        mov     T1, [A0 + 4]            ; high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; swapped low becomes new high
        mov     [A0], T1                ; swapped high becomes new low
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1771
1772
1773;;
1774; Macro for implementing a shift operation.
1775;
1776; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1777; 32-bit system where the 64-bit accesses requires hand coding.
1778;
1779; All the functions takes a pointer to the destination memory operand in A0,
1780; the shift count in A1 and a pointer to eflags in A2.
1781;
1782; @param 1 The instruction mnemonic.
1783; @param 2 The modified flags.
1784; @param 3 The undefined flags.
1785;
1786; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1787;
1788; @note the _intel and _amd variants are implemented in C.
1789;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
; 8-bit shift/rotate: [A0] = dest, A1 = count, A2 = pEFlags.
; The count must end up in CL: on GCC/SysV A1 is RSI so CL is free to
; copy into; on MSC A1 is already RDX but A0 is RCX, so the two are
; exchanged instead, leaving the count in CL and the pointer in A1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; count -> ECX/CL, pointer -> A1
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
; 64-bit variant - 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
 %else
        xchg    A1, A0
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1859
1860
1861;;
1862; Macro for implementing a double precision shift operation.
1863;
1864; This will generate code for the 16, 32 and 64 bit accesses, except on
1865; 32-bit system where the 64-bit accesses requires hand coding.
1866;
1867; The functions takes the destination operand (r/m) in A0, the source (reg) in
1868; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1869;
1870; @param 1 The instruction mnemonic.
1871; @param 2 The modified flags.
1872; @param 3 The undefined flags.
1873;
1874; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1875;
1876; @note the _intel and _amd variants are implemented in C.
1877;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
; 16-bit double shift: [A0] = dest, A1 = source reg, A2 = count, A3 = pEFlags.
; The count must be in CL: on GCC/SysV A3 is RCX, so A3/A2 are swapped
; around the operation (and swapped back so IEM_SAVE_FLAGS sees pEFlags in
; A3); on MSC A0 is RCX, so A0/A2 are exchanged instead, putting the count
; in CL and the destination pointer in A2.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> CL, pEFlags parked in A2
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; restore pEFlags to A3
 %else
        xchg    A0, A2                  ; count -> CL, dest pointer -> A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
; 64-bit variant - 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1931
1932
1933;;
1934; Macro for implementing a multiplication operations.
1935;
1936; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1937; 32-bit system where the 64-bit accesses requires hand coding.
1938;
1939; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1940; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1941; pointer to eflags in A3.
1942;
1943; The functions all return 0 so the caller can be used for div/idiv as well as
1944; for the mul/imul implementation.
1945;
1946; @param 1 The instruction mnemonic.
1947; @param 2 The modified flags.
1948; @param 3 The undefined flags.
1949; @param 4 Name suffix.
1950; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
1951;
1952; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1953;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
; 8-bit mul/imul: AX = AL * operand.  A0 = pu16AX, A1 = operand (by value),
; A2 = pEFlags.  Returns 0 in eax (shared convention with the div workers).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax                ; store the 16-bit product
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        ; Intel EFLAGS variation: adjust AF/ZF, derive SF & PF from result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; success indicator
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

; 16-bit: DX:AX = AX * operand.  A0 = pu16AX, A1 = pu16DX, A2 = operand,
; A3 = pEFlags.  On MSC A1 is RDX, which the instruction clobbers, so the
; pointer is parked in T1 first.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; save pu16DX before DX is clobbered
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

; 32-bit: EDX:EAX = EAX * operand.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; save pu32DX before EDX is clobbered
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
; 64-bit: RDX:RAX = RAX * operand.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; save pu64DX before RDX is clobbered
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiate native, Intel and AMD EFLAGS variants for mul and imul.
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2050
2051
2052BEGINCODE
2053;;
; Worker function for negating the 64-bit value held in the T1_32:T0_32
; register pair (T1_32 = high dword, T0_32 = low dword)
2055; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        ; Computes 0 - (T1:T0) without needing extra registers: two zeros
        ; are pushed, the operands are swapped onto the stack, and a
        ; subtract-with-borrow chain does the multi-word negation.
        ; Note: the sub/sbb pair modifies the arithmetic EFLAGS.
        push    0
        push    0
        xchg    T0_32, [xSP]            ; stack = old T0, T0 = 0
        xchg    T1_32, [xSP + xCB]      ; stack = old T1, T1 = 0
        sub     T0_32, [xSP]            ; T0 = 0 - old T0 (sets borrow)
        sbb     T1_32, [xSP + xCB]      ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the two temporaries
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2066
2067%ifdef RT_ARCH_AMD64
2068;;
; Worker function for negating the 128-bit value held in the T1:T0
; register pair (T1 = high qword, T0 = low qword)
2070; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        ; Same stack trick as the _u32 worker above, operating on full
        ; 64-bit registers: computes 0 - (T1:T0) via subtract-with-borrow.
        ; Note: the sub/sbb pair modifies the arithmetic EFLAGS.
        push    0
        push    0
        xchg    T0, [xSP]               ; stack = old T0, T0 = 0
        xchg    T1, [xSP + xCB]         ; stack = old T1, T1 = 0
        sub     T0, [xSP]               ; T0 = 0 - old T0 (sets borrow)
        sbb     T1, [xSP + xCB]         ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the two temporaries
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
2081%endif
2082
2083
2084;;
2085; Macro for implementing a division operations.
2086;
2087; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2088; 32-bit system where the 64-bit accesses requires hand coding.
2089;
2090; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2091; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2092; pointer to eflags in A3.
2093;
2094; The functions all return 0 on success and -1 if a divide error should be
2095; raised by the caller.
2096;
2097; @param 1 The instruction mnemonic.
2098; @param 2 The modified flags.
2099; @param 3 The undefined flags.
2100; @param 4 1 if signed, 0 if unsigned.
2101; @param 5 Function suffix.
2102; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2103; 2 for AMD (set AF, clear PF, ZF and SF).
2104;
2105; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2106;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
;
; 8-bit variant: A0 = pointer to AX, A1 = divisor, A2 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; quotient overflows if AH >= divisor
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = raise #DE in the caller
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

;
; 16-bit variant: A0 = pointer to AX, A1 = pointer to DX, A2 = divisor,
; A3 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; quotient overflows if DX >= divisor
        jae     .div_overflow
 %else
        mov     T0_16, [A1]             ; assemble the 32-bit dividend in T0:
        shl     T0_32, 16               ;   high word from *A1,
        mov     T0_16, [A0]             ;   low word from *A0.
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; free up dx (== A2_16) for the division
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; free up the A1 register pair for dx
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = raise #DE in the caller
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

;
; 32-bit variant: A0 = pointer to EAX, A1 = pointer to EDX, A2 = divisor,
; A3 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; quotient overflows if EDX >= divisor
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of registers on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; free up edx (== A2_32) for the division
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; free up the A1 register pair for edx
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; undo the divisor save on the signed path
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 = raise #DE in the caller
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;
; 64-bit variant: A0 = pointer to RAX, A1 = pointer to RDX, A2 = divisor,
; A3 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2                  ; div by chainsaw check.
        jz      .div_zero
  %if %4 == 0
        cmp     [A1], A2                ; quotient overflows if RDX >= divisor
        jae     .div_overflow
  %else
        push    A2                      ; save A2 so we can modify it (we're out of registers on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
  %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; free up rdx (== A2) for the division
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                  ; free up the A1 register pair for rdx
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
  %else
        IEM_SAVE_FLAGS A3, %2, %3
  %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
  %if %4 != 0
        pop     A2                      ; undo the divisor save on the signed path
  %endif
.div_zero:
        mov     eax, -1                 ; return -1 = raise #DE in the caller
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; !RT_ARCH_AMD64

%endmacro
2424
; Instantiate div/idiv in native, Intel and AMD eflags variations.  The native
; variants pass the full arithmetic flag set as undefined (param 3); the
; _intel/_amd variants pass none and instead select the CPU-specific eflags
; handling via param 6 (1 = Intel, 2 = AMD).
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2431
2432
;;
; Macro for implementing a memory fence operation.
;
; The generated worker simply executes the fence instruction and returns;
; no return value, no operands or anything.
;
; @param 1      The fence instruction (lfence/sfence/mfence).
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2451
;;
; Alternative full fence for non-SSE2 hosts (no lfence/sfence/mfence).
;
; An xchg with a memory operand carries an implicit LOCK prefix, and a locked
; read-modify-write serves as a full memory barrier on x86.  The swap with the
; just-pushed copy leaves xAX unchanged, and the stack pointer is restored
; afterwards, so the routine has no architectural side effects.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX
        xchg    xAX, [xSP]              ; implicitly LOCKed - this is the fence
        add     xSP, xCB                ; drop the temporary stack slot
        ret
ENDPROC iemAImpl_alt_mem_fence
2461
2462
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; Uses the explicit 32-bit register forms (T0_32/T1_32) like the sibling
; FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 macro below: on 64-bit hosts a
; 32-bit write zero-extends the full register with a shorter encoding, and on
; 32-bit hosts T0_32 is T0, so the behavior is identical either way.
;
; @uses     24 bytes of stack. T0, T1
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; capture the host FPU environment as template

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK   ; keep only the guest condition code bits
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; keep the host top-of-stack bits
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; activate the merged environment
%endmacro
2488
2489
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word, status word, and update the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses     24 bytes of stack. T0, T1
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; capture the host FPU environment as template

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK   ; keep only the guest condition code bits
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; keep the host top-of-stack bits
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK ; T1 = guest TOP, i.e. index of guest ST0
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3 (empty)
%%st0_not_empty:

        fldenv  [xSP]                   ; activate the merged environment
%endmacro
2527
2528
;;
; Result of an FPU operation: the 80-bit value followed by the status word.
; Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result  resw 5                  ; 80-bit extended precision result
    .FSW        resw 1                  ; resulting FPU status word
endstruc
2536
2537
;;
; Result of an FPU operation producing two values, each followed by/preceding
; the shared status word.  Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; first 80-bit result
    .FSW        resw 1                  ; resulting FPU status word
    .r80Result2 resw 5                  ; second 80-bit result
endstruc
2546
2547
2548;
2549;---------------------- 16-bit signed integer operations ----------------------
2550;
2551
2552
;;
; Converts a 16-bit signed integer value to an 80-bit floating point one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2576
2577
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; after fld so guest FCW governs the conversion
        fistp   word [A2]

        fnstsw  word [A1]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2601
2602
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]               ; truncating store, ignores the RC bits

        fnstsw  word [A1]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2627
2628
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; ST0 = ST0 <op> (int16)*A3

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2665
2666
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; compare ST0 against (int16)*A3

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2697
2698
2699
2700;
2701;---------------------- 32-bit signed integer operations ----------------------
2702;
2703
2704
;;
; Converts a 32-bit signed integer value to an 80-bit floating point one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2728
2729
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; after fld so guest FCW governs the conversion
        fistp   dword [A2]

        fnstsw  word [A1]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2753
2754
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]              ; truncating store, ignores the RC bits

        fnstsw  word [A1]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2779
2780
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; ST0 = ST0 <op> (int32)*A3

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2817
2818
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; compare ST0 against (int32)*A3

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2849
2850
2851
2852;
2853;---------------------- 64-bit signed integer operations ----------------------
2854;
2855
2856
;;
; Converts a 64-bit signed integer value to an 80-bit floating point one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2880
2881
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; after fld so guest FCW governs the conversion
        fistp   qword [A2]

        fnstsw  word [A1]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2905
2906
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]              ; truncating store, ignores the RC bits

        fnstsw  word [A1]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2931
2932
2933
2934;
2935;---------------------- 32-bit floating point operations ----------------------
2936;
2937
;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2961
2962
;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; after fld so guest FCW governs the conversion
        fst     dword [A2]

        fnstsw  word [A1]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2986
2987
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; ST0 = ST0 <op> (float)*A3

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3024
3025
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; compare ST0 against (float)*A3

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3056
3057
3058
3059;
3060;---------------------- 64-bit floating point operations ----------------------
3061;
3062
;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3086
3087
;;
; Store a 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; after fld so guest FCW governs the conversion
        fst     qword [A2]

        fnstsw  word [A1]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3111
3112
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; ST0 = ST0 <op> (double)*A3

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3149
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; compare ST0 against (double)*A3

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3180
3181
3182
3183;
3184;---------------------- 80-bit floating point operations ----------------------
3185;
3186
;;
; Loads a 80-bit floating point register value from memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3210
3211
;;
; Store a 80-bit floating point register to memory
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]

        fnstsw  word [A1]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3235
3236
;;
; Loads an 80-bit floating point register value in BCD format from memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]              ; packed BCD load

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3260
3261
;;
; Store a 80-bit floating point register to memory as BCD
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit BCD value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; after fld so guest FCW governs the conversion
        fbstp   tword [A2]              ; packed BCD store

        fnstsw  word [A1]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3285
3286
;;
; FPU instruction working on two 80-bit floating point values.
;
; @param    1       The instruction
; @param    2       The instruction's operand list, e.g. {st0, st1} or {} for
;                   the implicit-operand forms (fprem/fprem1/fscale).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST1 = *A3
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3327
3328
;;
; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
; storing the result in ST1 and popping the stack.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; becomes ST1 after the next load
        fld     tword [A3]              ; ST0 = *A3
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; result in ST1, stack popped -> result now in ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3364
3365
;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST1 = *A3
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3398
3399
;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param    1       The instruction
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST1 = *A3
        fld     tword [A2]              ; ST0 = *A2
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1                     ; fcomi/fucomi: compares ST0 with ST1, sets eflags

        fnstsw  word [A1]
        pushf                           ; return the resulting eflags in xAX
        pop     xAX

        fninit                          ; don't leak guest state into the host FPU
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3435
3436
3437;;
3438; FPU instruction working on one 80-bit floating point value.
3439;
3440; @param 1 The instruction
3441;
3442; @param A0 FPU context (fxsave).
3443; @param A1 Pointer to a IEMFPURESULT for the output.
3444; @param A2 Pointer to the 80-bit value.
3445;
3446%macro IEMIMPL_FPU_R80 1
3447BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3448 PROLOGUE_3_ARGS
3449 sub xSP, 20h ; scratch area
3450
3451 fninit
3452 fld tword [A2] ; operand in ST0
3453 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3454 %1 ; operates on ST0 in place
3455
3456 fnstsw word [A1 + IEMFPURESULT.FSW]
3457 fnclex ; avoid faulting on a pending exception in fstp
3458 fstp tword [A1 + IEMFPURESULT.r80Result]
3459
3460 fninit
3461 add xSP, 20h
3462 EPILOGUE_3_ARGS
3463ENDPROC iemAImpl_ %+ %1 %+ _r80
3464%endmacro
3465
3466IEMIMPL_FPU_R80 fchs
3467IEMIMPL_FPU_R80 fabs
3468IEMIMPL_FPU_R80 f2xm1
3469IEMIMPL_FPU_R80 fsqrt
3470IEMIMPL_FPU_R80 frndint
3471IEMIMPL_FPU_R80 fsin
3472IEMIMPL_FPU_R80 fcos
3473
3474
3475;;
3476; FPU instruction working on one 80-bit floating point value, only
3477; returning FSW.
3478;
3479; @param 1 The instruction
3480; @param 2 Non-zero to also restore FTW.
3481;
3482; @param A0 FPU context (fxsave).
3483; @param A1 Pointer to a uint16_t for the resulting FSW.
3484; @param A2 Pointer to the 80-bit value.
3485;
3486%macro IEMIMPL_FPU_R80_FSW 2
3487BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3488 PROLOGUE_3_ARGS
3489 sub xSP, 20h ; scratch area
3490
3491 fninit
3492 fld tword [A2]
3493%if %2 != 0
3494 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0 ; fxam needs the tag word to classify empty regs
3495%else
3496 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3497%endif
3498 %1
3499
3500 fnstsw word [A1]
3501
3502 fninit
3503 add xSP, 20h
3504 EPILOGUE_3_ARGS
3505ENDPROC iemAImpl_ %+ %1 %+ _r80
3506%endmacro
3507
3508IEMIMPL_FPU_R80_FSW ftst, 0
3509IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3510
3511
3512
3513;;
3514; FPU instruction loading a 80-bit floating point constant.
3515;
3516; @param 1 The instruction
3517;
3518; @param A0 FPU context (fxsave).
3519; @param A1 Pointer to a IEMFPURESULT for the output.
3520;
3521%macro IEMIMPL_FPU_R80_CONST 1
3522BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
3523 PROLOGUE_2_ARGS
3524 sub xSP, 20h ; scratch area
3525
3526 fninit
3527 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; FCW matters: rounding affects the loaded constant
3528 %1 ; pushes the constant as ST0
3529
3530 fnstsw word [A1 + IEMFPURESULT.FSW]
3531 fnclex
3532 fstp tword [A1 + IEMFPURESULT.r80Result]
3533
3534 fninit
3535 add xSP, 20h
3536 EPILOGUE_2_ARGS
3537ENDPROC iemAImpl_ %+ %1 %+
3538%endmacro
3539
3540IEMIMPL_FPU_R80_CONST fld1
3541IEMIMPL_FPU_R80_CONST fldl2t
3542IEMIMPL_FPU_R80_CONST fldl2e
3543IEMIMPL_FPU_R80_CONST fldpi
3544IEMIMPL_FPU_R80_CONST fldlg2
3545IEMIMPL_FPU_R80_CONST fldln2
3546IEMIMPL_FPU_R80_CONST fldz
3547
3548
3549;;
3550; FPU instruction working on one 80-bit floating point value, outputing two.
3551;
3552; @param 1 The instruction
3553;
3554; @param A0 FPU context (fxsave).
3555; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3556; @param A2 Pointer to the 80-bit value.
3557;
3558%macro IEMIMPL_FPU_R80_R80 1
3559BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
3560 PROLOGUE_3_ARGS
3561 sub xSP, 20h ; scratch area
3562
3563 fninit
3564 fld tword [A2]
3565 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3566 %1 ; leaves two values on the stack (ST0, ST1)
3567
3568 fnstsw word [A1 + IEMFPURESULTTWO.FSW]
3569 fnclex
3570 fstp tword [A1 + IEMFPURESULTTWO.r80Result2] ; top of stack is the second result
3571 fnclex ; popping may re-raise; clear again before the next fstp
3572 fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
3573
3574 fninit
3575 add xSP, 20h
3576 EPILOGUE_3_ARGS
3577ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
3578%endmacro
3579
3580IEMIMPL_FPU_R80_R80 fptan
3581IEMIMPL_FPU_R80_R80 fxtract
3582IEMIMPL_FPU_R80_R80 fsincos
3583
3584
3585
3586
3587;---------------------- SSE and MMX Operations ----------------------
3588
; The prologue/epilogue pairs below are currently empty placeholders.  Call
; sites must nevertheless keep PROLOGUE/EPILOGUE correctly paired so that any
; future content (state save/restore) works without auditing every helper.
3589;; @todo what do we need to do for MMX?
3590%macro IEMIMPL_MMX_PROLOGUE 0
3591%endmacro
3592%macro IEMIMPL_MMX_EPILOGUE 0
3593%endmacro
3594
3595;; @todo what do we need to do for SSE?
3596%macro IEMIMPL_SSE_PROLOGUE 0
3597%endmacro
3598%macro IEMIMPL_SSE_EPILOGUE 0
3599%endmacro
3600
3601;; @todo what do we need to do for AVX?
3602%macro IEMIMPL_AVX_PROLOGUE 0
3603%endmacro
3604%macro IEMIMPL_AVX_EPILOGUE 0
3605%endmacro
3606
3607
3608;;
3609; Media instruction working on two full sized registers.
3610;
3611; @param 1 The instruction
3612; @param 2 Whether there is an MMX variant (1) or not (0).
3613;
3614; @param A0 FPU context (fxsave).
; NOTE(review): A0 is not referenced by the generated bodies below.
3615; @param A1 Pointer to the first media register size operand (input/output).
3616; @param A2 Pointer to the second media register size operand (input).
3617;
3618%macro IEMIMPL_MEDIA_F2 2
3619%if %2 != 0
3620BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3621 PROLOGUE_3_ARGS
3622 IEMIMPL_MMX_PROLOGUE
3623
3624 movq mm0, [A1]
3625 movq mm1, [A2]
3626 %1 mm0, mm1
3627 movq [A1], mm0 ; write result back over the first operand
3628
3629 IEMIMPL_MMX_EPILOGUE
3630 EPILOGUE_3_ARGS
3631ENDPROC iemAImpl_ %+ %1 %+ _u64
3632%endif
3633
3634BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3635 PROLOGUE_3_ARGS
3636 IEMIMPL_SSE_PROLOGUE
3637
3638 movdqu xmm0, [A1] ; unaligned loads: no alignment guarantee on the operands
3639 movdqu xmm1, [A2]
3640 %1 xmm0, xmm1
3641 movdqu [A1], xmm0
3642
3643 IEMIMPL_SSE_EPILOGUE
3644 EPILOGUE_3_ARGS
3645ENDPROC iemAImpl_ %+ %1 %+ _u128
3646%endmacro
3647
3648IEMIMPL_MEDIA_F2 pshufb, 1
3649IEMIMPL_MEDIA_F2 pand, 1
3650IEMIMPL_MEDIA_F2 pandn, 1
3651IEMIMPL_MEDIA_F2 por, 1
3652IEMIMPL_MEDIA_F2 pxor, 1
3653IEMIMPL_MEDIA_F2 pcmpeqb, 1
3654IEMIMPL_MEDIA_F2 pcmpeqw, 1
3655IEMIMPL_MEDIA_F2 pcmpeqd, 1
3656IEMIMPL_MEDIA_F2 pcmpeqq, 0
3657IEMIMPL_MEDIA_F2 pcmpgtb, 1
3658IEMIMPL_MEDIA_F2 pcmpgtw, 1
3659IEMIMPL_MEDIA_F2 pcmpgtd, 1
3660IEMIMPL_MEDIA_F2 pcmpgtq, 0
3661IEMIMPL_MEDIA_F2 paddb, 1
3662IEMIMPL_MEDIA_F2 paddw, 1
3663IEMIMPL_MEDIA_F2 paddd, 1
3664IEMIMPL_MEDIA_F2 paddq, 1
3665IEMIMPL_MEDIA_F2 paddsb, 1
3666IEMIMPL_MEDIA_F2 paddsw, 1
3667IEMIMPL_MEDIA_F2 paddusb, 1
3668IEMIMPL_MEDIA_F2 paddusw, 1
3669IEMIMPL_MEDIA_F2 psubb, 1
3670IEMIMPL_MEDIA_F2 psubw, 1
3671IEMIMPL_MEDIA_F2 psubd, 1
3672IEMIMPL_MEDIA_F2 psubq, 1
3673IEMIMPL_MEDIA_F2 psubsb, 1
3674IEMIMPL_MEDIA_F2 psubsw, 1
3675IEMIMPL_MEDIA_F2 psubusb, 1
3676IEMIMPL_MEDIA_F2 psubusw, 1
3677IEMIMPL_MEDIA_F2 pmullw, 1
3678IEMIMPL_MEDIA_F2 pmulld, 0
3679IEMIMPL_MEDIA_F2 pmulhw, 1
3680IEMIMPL_MEDIA_F2 pmaddwd, 1
3681IEMIMPL_MEDIA_F2 pminub, 1
3682IEMIMPL_MEDIA_F2 pminuw, 0
3683IEMIMPL_MEDIA_F2 pminud, 0
3684IEMIMPL_MEDIA_F2 pminsb, 0
3685IEMIMPL_MEDIA_F2 pminsw, 1
3686IEMIMPL_MEDIA_F2 pminsd, 0
3687IEMIMPL_MEDIA_F2 pmaxub, 1
3688IEMIMPL_MEDIA_F2 pmaxuw, 0
3689IEMIMPL_MEDIA_F2 pmaxud, 0
3690IEMIMPL_MEDIA_F2 pmaxsb, 0
3691IEMIMPL_MEDIA_F2 pmaxsw, 1
3692IEMIMPL_MEDIA_F2 pmaxsd, 0
3693IEMIMPL_MEDIA_F2 pabsb, 1
3694IEMIMPL_MEDIA_F2 pabsw, 1
3695IEMIMPL_MEDIA_F2 pabsd, 1
3696IEMIMPL_MEDIA_F2 psignb, 1
3697IEMIMPL_MEDIA_F2 psignw, 1
3698IEMIMPL_MEDIA_F2 psignd, 1
3699IEMIMPL_MEDIA_F2 phaddw, 1
3700IEMIMPL_MEDIA_F2 phaddd, 1
3701IEMIMPL_MEDIA_F2 phsubw, 1
3702IEMIMPL_MEDIA_F2 phsubd, 1
3703IEMIMPL_MEDIA_F2 phaddsw, 1
3704IEMIMPL_MEDIA_F2 phsubsw, 1
3705IEMIMPL_MEDIA_F2 pmaddubsw, 1
3706IEMIMPL_MEDIA_F2 pmulhrsw, 1
3707IEMIMPL_MEDIA_F2 pmuludq, 1
3708
3709
3710;;
3711; Media instruction working on two full sized registers, but no FXSAVE state argument.
3712;
3713; @param 1 The instruction
3714; @param 2 Whether there is an MMX variant (1) or not (0).
3715;
3716; @param A0 Pointer to the first media register size operand (input/output).
3717; @param A1 Pointer to the second media register size operand (input).
3718;
3719%macro IEMIMPL_MEDIA_OPT_F2 2
3720%if %2 != 0
3721BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3722 PROLOGUE_2_ARGS
3723 IEMIMPL_MMX_PROLOGUE
3724
3725 movq mm0, [A0]
3726 movq mm1, [A1]
3727 %1 mm0, mm1
3728 movq [A0], mm0 ; result replaces the first operand
3729
3730 IEMIMPL_MMX_EPILOGUE
3731 EPILOGUE_2_ARGS
3732ENDPROC iemAImpl_ %+ %1 %+ _u64
3733%endif
3734
3735BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3736 PROLOGUE_2_ARGS
3737 IEMIMPL_SSE_PROLOGUE
3738
3739 movdqu xmm0, [A0] ; unaligned loads: operands have no alignment guarantee
3740 movdqu xmm1, [A1]
3741 %1 xmm0, xmm1
3742 movdqu [A0], xmm0
3743
3744 IEMIMPL_SSE_EPILOGUE
3745 EPILOGUE_2_ARGS
3746ENDPROC iemAImpl_ %+ %1 %+ _u128
3747%endmacro
3748
3749IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3750IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3751IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3752IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3753IEMIMPL_MEDIA_OPT_F2 psllw, 1
3754IEMIMPL_MEDIA_OPT_F2 pslld, 1
3755IEMIMPL_MEDIA_OPT_F2 psllq, 1
3756IEMIMPL_MEDIA_OPT_F2 psrlw, 1
3757IEMIMPL_MEDIA_OPT_F2 psrld, 1
3758IEMIMPL_MEDIA_OPT_F2 psrlq, 1
3759IEMIMPL_MEDIA_OPT_F2 psraw, 1
3760IEMIMPL_MEDIA_OPT_F2 psrad, 1
3761IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
3762IEMIMPL_MEDIA_OPT_F2 pavgb, 1
3763IEMIMPL_MEDIA_OPT_F2 pavgw, 1
3764IEMIMPL_MEDIA_OPT_F2 psadbw, 1
3765IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
3766IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
3767IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
3768IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
3769IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3770IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3771IEMIMPL_MEDIA_OPT_F2 aesimc, 0
3772IEMIMPL_MEDIA_OPT_F2 aesenc, 0
3773IEMIMPL_MEDIA_OPT_F2 aesdec, 0
3774IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
3775IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3776IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
3777IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
3778IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
3779IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
3780IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3781
3782;;
3783; Media instruction working on one full sized and one half sized register (lower half).
3784;
3785; @param 1 The instruction
3786; @param 2 1 if MMX is included, 0 if not.
3787;
3788; @param A0 Pointer to the first full sized media register operand (input/output).
3789; @param A1 Pointer to the second half sized media register operand (input).
3790;
3791%macro IEMIMPL_MEDIA_F1L1 2
3792 %if %2 != 0
3793BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3794 PROLOGUE_2_ARGS
3795 IEMIMPL_MMX_PROLOGUE
3796
3797 movq mm0, [A0]
3798 movq mm1, [A1] ; full load; the instruction only consumes the relevant half
3799 %1 mm0, mm1
3800 movq [A0], mm0
3801
3802 IEMIMPL_MMX_EPILOGUE
3803 EPILOGUE_2_ARGS
3804ENDPROC iemAImpl_ %+ %1 %+ _u64
3805 %endif
3806
3807BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3808 PROLOGUE_2_ARGS
3809 IEMIMPL_SSE_PROLOGUE
3810
3811 movdqu xmm0, [A0]
3812 movdqu xmm1, [A1] ; full load; the instruction only consumes the relevant half
3813 %1 xmm0, xmm1
3814 movdqu [A0], xmm0
3815
3816 IEMIMPL_SSE_EPILOGUE
3817 EPILOGUE_2_ARGS
3818ENDPROC iemAImpl_ %+ %1 %+ _u128
3819%endmacro
3820
3821IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3822IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3823IEMIMPL_MEDIA_F1L1 punpckldq, 1
3824IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3825
3826
3827;;
3828; Media instruction working on two half sized input registers (lower half) and a full sized
3829; destination register (vpunpckl*).
3830;
3831; @param 1 The instruction
3832;
3833; @param A0 Pointer to the destination register (full sized, output only).
3834; @param A1 Pointer to the first full sized media source register operand, where we
3835; will only use the lower half as input - but we'll be loading it in full.
3836; @param A2 Pointer to the second full sized media source register operand, where we
3837; will only use the lower half as input - but we'll be loading it in full.
3838;
3839%macro IEMIMPL_MEDIA_F1L1L1 1
3840BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3841 PROLOGUE_3_ARGS
3842 IEMIMPL_AVX_PROLOGUE
3843
3844 vmovdqu xmm0, [A1]
3845 vmovdqu xmm1, [A2]
3846 %1 xmm0, xmm0, xmm1
3847 vmovdqu [A0], xmm0
3848
3849 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE (harmless while the macros are empty)
3850 EPILOGUE_3_ARGS
3851ENDPROC iemAImpl_ %+ %1 %+ _u128
3852
3853BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
3854 PROLOGUE_3_ARGS
3855 IEMIMPL_AVX_PROLOGUE
3856
3857 vmovdqu ymm0, [A1]
3858 vmovdqu ymm1, [A2]
3859 %1 ymm0, ymm0, ymm1
3860 vmovdqu [A0], ymm0
3861
3862 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE
3863 EPILOGUE_3_ARGS
3864ENDPROC iemAImpl_ %+ %1 %+ _u256
3865%endmacro
3866
3867IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
3868IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
3869IEMIMPL_MEDIA_F1L1L1 vpunpckldq
3870IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3871
3872
3873;;
3874; Media instruction working on one full sized and one half sized register (high half).
3875;
3876; @param 1 The instruction
3877; @param 2 1 if MMX is included, 0 if not.
3878;
3879; @param A0 Pointer to the first full sized media register operand (input/output).
3880; @param A1 Pointer to the second full sized media register operand, where we
3881; will only use the upper half as input - but we'll load it in full.
3882;
; The high-half variant generates exactly the same load/op/store shell as the
; low-half one; only the instruction mnemonic differs.
3883%macro IEMIMPL_MEDIA_F1H1 2
3884IEMIMPL_MEDIA_F1L1 %1, %2
3885%endmacro
3886
; NOTE(review): these invocations use IEMIMPL_MEDIA_F1L1 directly rather than
; the IEMIMPL_MEDIA_F1H1 wrapper defined just above - equivalent, but confirm
; the wrapper is intentionally unused here.
3887IEMIMPL_MEDIA_F1L1 punpckhbw, 1
3888IEMIMPL_MEDIA_F1L1 punpckhwd, 1
3889IEMIMPL_MEDIA_F1L1 punpckhdq, 1
3890IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
3891
3892
3893;;
3894; Media instruction working two half sized input registers (high half) and a full sized
3895; destination register (vpunpckh*).
3896;
3897; @param 1 The instruction
3898;
3899; @param A0 Pointer to the destination register (full sized, output only).
3900; @param A1 Pointer to the first full sized media source register operand, where we
3901; will only use the upper half as input - but we'll be loading it in full.
3902; @param A2 Pointer to the second full sized media source register operand, where we
3903; will only use the upper half as input - but we'll be loading it in full.
3904;
; Same code shell as the low-half template; the mnemonic selects the half.
3905%macro IEMIMPL_MEDIA_F1H1H1 1
3906IEMIMPL_MEDIA_F1L1L1 %1
3907%endmacro
3908
3909IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
3910IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
3911IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
3912IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3913
3914
3915;
3916; Shufflers with evil 8-bit immediates.
3917;
3918
; pshufw with a runtime immediate: dispatch into a 256-entry jump table of
; "pshufw mm0, mm1, imm / ret" stubs (the immediate must be encoded, so one
; stub per value).  A0 = result ptr, A1 = source ptr, A2 = immediate (0..255).
3919BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
3920 PROLOGUE_3_ARGS
3921 IEMIMPL_MMX_PROLOGUE
3922
3923 movq mm1, [A1]
3924 movq mm0, mm1 ; paranoia! (fixed: was the no-op "movq mm0, mm0"; the SSE variants likewise pre-init dst from src)
3925 lea T1, [.imm0 xWrtRIP]
3926 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
3927 lea T0, [A2 + A2*8] ; sizeof(pshufw+ret) == 9
3928 %else
3929 lea T0, [A2 + A2*4] ; sizeof(pshufw+ret) == 5
3930 %endif
3931 lea T1, [T1 + T0] ; T1 = &.imm0 + imm * stub_size
3932 IBT_NOTRACK
3933 call T1
3934 movq [A0], mm0
3935
3936 IEMIMPL_MMX_EPILOGUE
3937 EPILOGUE_3_ARGS
3938%assign bImm 0
3939%rep 256
3940.imm %+ bImm:
3941 IBT_ENDBRxx_WITHOUT_NOTRACK
3942 pshufw mm0, mm1, bImm
3943 ret
3944 %assign bImm bImm + 1
3945%endrep
3946.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
3947ENDPROC iemAImpl_pshufw_u64
3948
3949
; SSE shuffles with a runtime immediate (pshufhw/pshuflw/pshufd): dispatch
; into a 256-entry jump table of "op xmm0, xmm1, imm / ret" stubs.
; A0 = result ptr, A1 = source ptr, A2 = immediate (0..255).
3950%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
3951BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
3952 PROLOGUE_3_ARGS
3953 IEMIMPL_SSE_PROLOGUE
3954
3955 movdqu xmm1, [A1]
3956 movdqu xmm0, xmm1 ; paranoia!
3957 lea T1, [.imm0 xWrtRIP]
3958 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
3959 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
3960 %else
3961 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
3962 %endif
3963 lea T1, [T1 + T0*2] ; T1 = &.imm0 + imm * stub_size
3964 IBT_NOTRACK
3965 call T1
3966 movdqu [A0], xmm0
3967
3968 IEMIMPL_SSE_EPILOGUE
3969 EPILOGUE_3_ARGS
3970
3971 %assign bImm 0
3972 %rep 256
3973.imm %+ bImm:
3974 IBT_ENDBRxx_WITHOUT_NOTRACK
3975 %1 xmm0, xmm1, bImm
3976 ret
3977 %assign bImm bImm + 1
3978 %endrep
3979.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
3980ENDPROC iemAImpl_ %+ %1 %+ _u128
3981%endmacro
3982
3983IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
3984IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
3985IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3986
3987
; AVX2 256-bit shuffles with a runtime immediate: same jump-table scheme as
; the SSE variant above.  A0 = result ptr, A1 = source ptr, A2 = immediate.
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE/EPILOGUE rather than the AVX pair;
; harmless while both are empty, but worth aligning with the other AVX helpers.
3988%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
3989BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
3990 PROLOGUE_3_ARGS
3991 IEMIMPL_SSE_PROLOGUE
3992
3993 vmovdqu ymm1, [A1]
3994 vmovdqu ymm0, ymm1 ; paranoia!
3995 lea T1, [.imm0 xWrtRIP]
3996 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
3997 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
3998 %else
3999 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4000 %endif
4001 lea T1, [T1 + T0*2] ; T1 = &.imm0 + imm * stub_size
4002 IBT_NOTRACK
4003 call T1
4004 vmovdqu [A0], ymm0
4005
4006 IEMIMPL_SSE_EPILOGUE
4007 EPILOGUE_3_ARGS
4008 %assign bImm 0
4009 %rep 256
4010.imm %+ bImm:
4011 IBT_ENDBRxx_WITHOUT_NOTRACK
4012 %1 ymm0, ymm1, bImm
4013 ret
4014 %assign bImm bImm + 1
4015 %endrep
4016.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4017ENDPROC iemAImpl_ %+ %1 %+ _u256
4018%endmacro
4019
4020IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
4021IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
4022IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4023
4024
4025;
4026; Shifts with evil 8-bit immediates.
4027;
4028
; MMX shifts with a runtime immediate: 256-entry jump table of
; "psXX mm0, imm / ret" stubs.  A0 = operand ptr (in/out), A1 = immediate.
4029%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
4030BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
4031 PROLOGUE_2_ARGS
4032 IEMIMPL_MMX_PROLOGUE
4033
4034 movq mm0, [A0]
4035 lea T1, [.imm0 xWrtRIP]
4036 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4037 lea T0, [A1 + A1*8] ; sizeof(psXX+ret) == 9
4038 %else
4039 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 5
4040 %endif
4041 lea T1, [T1 + T0] ; T1 = &.imm0 + imm * stub_size
4042 IBT_NOTRACK
4043 call T1
4044 movq [A0], mm0 ; shift is applied in place
4045
4046 IEMIMPL_MMX_EPILOGUE
4047 EPILOGUE_2_ARGS
4048%assign bImm 0
4049%rep 256
4050.imm %+ bImm:
4051 IBT_ENDBRxx_WITHOUT_NOTRACK
4052 %1 mm0, bImm
4053 ret
4054 %assign bImm bImm + 1
4055%endrep
4056.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
4057ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
4058%endmacro
4059
4060IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4061IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4062IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4063IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4064IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4065IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4066IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4067IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4068
4069
; SSE shifts with a runtime immediate: 256-entry jump table of
; "psXX xmm0, imm / ret" stubs.  A0 = operand ptr (in/out), A1 = immediate.
4070%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
4071BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
4072 PROLOGUE_2_ARGS
4073 IEMIMPL_SSE_PROLOGUE
4074
4075 movdqu xmm0, [A0]
4076 lea T1, [.imm0 xWrtRIP]
4077 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4078 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
4079 %else
4080 lea T0, [A1 + A1*2] ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
4081 %endif
4082 lea T1, [T1 + T0*2] ; T1 = &.imm0 + imm * stub_size
4083 IBT_NOTRACK
4084 call T1
4085 movdqu [A0], xmm0 ; shift is applied in place
4086
4087 IEMIMPL_SSE_EPILOGUE
4088 EPILOGUE_2_ARGS
4089 %assign bImm 0
4090 %rep 256
4091.imm %+ bImm:
4092 IBT_ENDBRxx_WITHOUT_NOTRACK
4093 %1 xmm0, bImm
4094 ret
4095 %assign bImm bImm + 1
4096 %endrep
4097.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4098ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
4099%endmacro
4100
4101IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
4102IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
4103IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
4104IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
4105IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
4106IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
4107IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
4108IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
4109IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
4110IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4111
4112
4113;
4114; Move byte mask.
4115;
; A0 = pointer to the (64-bit) destination, A1 = pointer to the source
; register value.  On 32-bit hosts the upper destination dword is zeroed
; explicitly since the 32-bit store only covers the low half.
4116
4117BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
4118 PROLOGUE_2_ARGS
4119 IEMIMPL_MMX_PROLOGUE
4120
4121 movq mm1, [A1]
4122 pmovmskb T0, mm1
4123 mov [A0], T0
4124%ifdef RT_ARCH_X86
4125 mov dword [A0 + 4], 0 ; zero the high dword the 32-bit store didn't reach
4126%endif
4127 IEMIMPL_MMX_EPILOGUE
4128 EPILOGUE_2_ARGS
4129ENDPROC iemAImpl_pmovmskb_u64
4130
4131BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
4132 PROLOGUE_2_ARGS
4133 IEMIMPL_SSE_PROLOGUE
4134
4135 movdqu xmm1, [A1]
4136 pmovmskb T0, xmm1
4137 mov [A0], T0
4138%ifdef RT_ARCH_X86
4139 mov dword [A0 + 4], 0 ; zero the high dword the 32-bit store didn't reach
4140%endif
4141 IEMIMPL_SSE_EPILOGUE
4142 EPILOGUE_2_ARGS
4143ENDPROC iemAImpl_pmovmskb_u128
4144
4145BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
4146 PROLOGUE_2_ARGS
4147 IEMIMPL_AVX_PROLOGUE
4148
4149 vmovdqu ymm1, [A1]
4150 vpmovmskb T0, ymm1
4151 mov [A0], T0
4152%ifdef RT_ARCH_X86
4153 mov dword [A0 + 4], 0 ; zero the high dword the 32-bit store didn't reach
4154%endif
4155 IEMIMPL_AVX_EPILOGUE
4156 EPILOGUE_2_ARGS
4157ENDPROC iemAImpl_vpmovmskb_u256
4158
4159
4160;;
4161; Media instruction working on two full sized source registers and one destination (AVX).
4162;
4163; @param 1 The instruction
4164;
4165; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
;           NOTE(review): A0 is not referenced by the generated bodies.
4166; @param A1 Pointer to the destination media register size operand (output).
4167; @param A2 Pointer to the first source media register size operand (input).
4168; @param A3 Pointer to the second source media register size operand (input).
4169;
4170%macro IEMIMPL_MEDIA_F3 1
4171BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4172 PROLOGUE_4_ARGS
4173 IEMIMPL_AVX_PROLOGUE
4174
4175 vmovdqu xmm0, [A2]
4176 vmovdqu xmm1, [A3]
4177 %1 xmm0, xmm0, xmm1
4178 vmovdqu [A1], xmm0
4179
4180 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE (harmless while the macros are empty)
4181 EPILOGUE_4_ARGS
4182ENDPROC iemAImpl_ %+ %1 %+ _u128
4183
4184BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4185 PROLOGUE_4_ARGS
4186 IEMIMPL_AVX_PROLOGUE
4187
4188 vmovdqu ymm0, [A2]
4189 vmovdqu ymm1, [A3]
4190 %1 ymm0, ymm0, ymm1
4191 vmovdqu [A1], ymm0
4192
4193 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE
4194 EPILOGUE_4_ARGS
4195ENDPROC iemAImpl_ %+ %1 %+ _u256
4196%endmacro
4197
4198IEMIMPL_MEDIA_F3 vpshufb
4199IEMIMPL_MEDIA_F3 vpand
4200IEMIMPL_MEDIA_F3 vpminub
4201IEMIMPL_MEDIA_F3 vpminuw
4202IEMIMPL_MEDIA_F3 vpminud
4203IEMIMPL_MEDIA_F3 vpminsb
4204IEMIMPL_MEDIA_F3 vpminsw
4205IEMIMPL_MEDIA_F3 vpminsd
4206IEMIMPL_MEDIA_F3 vpmaxub
4207IEMIMPL_MEDIA_F3 vpmaxuw
4208IEMIMPL_MEDIA_F3 vpmaxud
4209IEMIMPL_MEDIA_F3 vpmaxsb
4210IEMIMPL_MEDIA_F3 vpmaxsw
4211IEMIMPL_MEDIA_F3 vpmaxsd
4212IEMIMPL_MEDIA_F3 vpandn
4213IEMIMPL_MEDIA_F3 vpor
4214IEMIMPL_MEDIA_F3 vpxor
4215IEMIMPL_MEDIA_F3 vpcmpeqb
4216IEMIMPL_MEDIA_F3 vpcmpeqw
4217IEMIMPL_MEDIA_F3 vpcmpeqd
4218IEMIMPL_MEDIA_F3 vpcmpeqq
4219IEMIMPL_MEDIA_F3 vpcmpgtb
4220IEMIMPL_MEDIA_F3 vpcmpgtw
4221IEMIMPL_MEDIA_F3 vpcmpgtd
4222IEMIMPL_MEDIA_F3 vpcmpgtq
4223IEMIMPL_MEDIA_F3 vpaddb
4224IEMIMPL_MEDIA_F3 vpaddw
4225IEMIMPL_MEDIA_F3 vpaddd
4226IEMIMPL_MEDIA_F3 vpaddq
4227IEMIMPL_MEDIA_F3 vpsubb
4228IEMIMPL_MEDIA_F3 vpsubw
4229IEMIMPL_MEDIA_F3 vpsubd
4230IEMIMPL_MEDIA_F3 vpsubq
4231
4232
4233;;
4234; Media instruction working on two full sized source registers and one destination (AVX),
4235; but no XSAVE state pointer argument.
4236;
4237; @param 1 The instruction
4238;
4239; @param A0 Pointer to the destination media register size operand (output).
4240; @param A1 Pointer to the first source media register size operand (input).
4241; @param A2 Pointer to the second source media register size operand (input).
4242;
4243%macro IEMIMPL_MEDIA_OPT_F3 1
4244BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4245 PROLOGUE_3_ARGS
4246 IEMIMPL_AVX_PROLOGUE
4247
4248 vmovdqu xmm0, [A1]
4249 vmovdqu xmm1, [A2]
4250 %1 xmm0, xmm0, xmm1
4251 vmovdqu [A0], xmm0
4252
4253 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE (harmless while the macros are empty)
4254 EPILOGUE_3_ARGS
4255ENDPROC iemAImpl_ %+ %1 %+ _u128
4256
4257BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4258 PROLOGUE_3_ARGS
4259 IEMIMPL_AVX_PROLOGUE
4260
4261 vmovdqu ymm0, [A1]
4262 vmovdqu ymm1, [A2]
4263 %1 ymm0, ymm0, ymm1
4264 vmovdqu [A0], ymm0
4265
4266 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE
4267 EPILOGUE_3_ARGS
4268ENDPROC iemAImpl_ %+ %1 %+ _u256
4269%endmacro
4270
4271IEMIMPL_MEDIA_OPT_F3 vpacksswb
4272IEMIMPL_MEDIA_OPT_F3 vpackssdw
4273IEMIMPL_MEDIA_OPT_F3 vpackuswb
4274IEMIMPL_MEDIA_OPT_F3 vpackusdw
4275IEMIMPL_MEDIA_OPT_F3 vpmullw
4276IEMIMPL_MEDIA_OPT_F3 vpmulld
4277IEMIMPL_MEDIA_OPT_F3 vpmulhw
4278IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4279IEMIMPL_MEDIA_OPT_F3 vpavgb
4280IEMIMPL_MEDIA_OPT_F3 vpavgw
4281IEMIMPL_MEDIA_OPT_F3 vpsignb
4282IEMIMPL_MEDIA_OPT_F3 vpsignw
4283IEMIMPL_MEDIA_OPT_F3 vpsignd
4284IEMIMPL_MEDIA_OPT_F3 vphaddw
4285IEMIMPL_MEDIA_OPT_F3 vphaddd
4286IEMIMPL_MEDIA_OPT_F3 vphsubw
4287IEMIMPL_MEDIA_OPT_F3 vphsubd
4288IEMIMPL_MEDIA_OPT_F3 vphaddsw
4289IEMIMPL_MEDIA_OPT_F3 vphsubsw
4290IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4291IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4292IEMIMPL_MEDIA_OPT_F3 vpsadbw
4293IEMIMPL_MEDIA_OPT_F3 vpmuldq
4294IEMIMPL_MEDIA_OPT_F3 vpmuludq
4295IEMIMPL_MEDIA_OPT_F3 vunpcklps
4296IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4297IEMIMPL_MEDIA_OPT_F3 vunpckhps
4298IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4299IEMIMPL_MEDIA_OPT_F3 vpsubsb
4300IEMIMPL_MEDIA_OPT_F3 vpsubsw
4301
4302
4303;;
4304; Media instruction working on one full sized source registers and one destination (AVX),
4305; but no XSAVE state pointer argument.
4306;
4307; @param 1 The instruction
4308; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
4309;
4310; @param A0 Pointer to the destination media register size operand (output).
4311; @param A1 Pointer to the source media register size operand (input).
4312;
4313%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
4314BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4315 PROLOGUE_2_ARGS
4316 IEMIMPL_AVX_PROLOGUE
4317
4318 vmovdqu xmm0, [A1]
4319 %1 xmm0, xmm0
4320 vmovdqu [A0], xmm0
4321
4322 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE (harmless while the macros are empty)
4323 EPILOGUE_2_ARGS
4324ENDPROC iemAImpl_ %+ %1 %+ _u128
4325
4326 %if %2 == 1
4327BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4328 PROLOGUE_2_ARGS
4329 IEMIMPL_AVX_PROLOGUE
4330
4331 vmovdqu ymm0, [A1]
4332 %1 ymm0, ymm0
4333 vmovdqu [A0], ymm0
4334
4335 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE
4336 EPILOGUE_2_ARGS
4337ENDPROC iemAImpl_ %+ %1 %+ _u256
4338 %endif
4339%endmacro
4340
4341IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4342IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4343IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4344IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4345
4346
4347;
4348; The SSE 4.2 crc32
4349;
4350; @param A0 Pointer to the 32-bit destination (accumulator, read-modify-write).
4351; @param A1 The source operand, sized according to the suffix.
4352;
; (Comment fixed: the parameters are A0/A1, matching the code below.)
4353BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
4354 PROLOGUE_2_ARGS
4355
4356 mov T0_32, [A0] ; load current CRC accumulator
4357 crc32 T0_32, A1_8
4358 mov [A0], T0_32
4359
4360 EPILOGUE_2_ARGS
4361ENDPROC iemAImpl_crc32_u8
4362
4363BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
4364 PROLOGUE_2_ARGS
4365
4366 mov T0_32, [A0]
4367 crc32 T0_32, A1_16
4368 mov [A0], T0_32
4369
4370 EPILOGUE_2_ARGS
4371ENDPROC iemAImpl_crc32_u16
4372
4373BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
4374 PROLOGUE_2_ARGS
4375
4376 mov T0_32, [A0]
4377 crc32 T0_32, A1_32
4378 mov [A0], T0_32
4379
4380 EPILOGUE_2_ARGS
4381ENDPROC iemAImpl_crc32_u32
4382
4383%ifdef RT_ARCH_AMD64 ; 64-bit source form only exists in long mode
4384BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
4385 PROLOGUE_2_ARGS
4386
4387 mov T0_32, [A0]
4388 crc32 T0, A1 ; 64-bit form; result still fits in 32 bits
4389 mov [A0], T0_32
4390
4391 EPILOGUE_2_ARGS
4392ENDPROC iemAImpl_crc32_u64
4393%endif
4394
4395
4396;
4397; PTEST (SSE 4.1)
4398;
4399; @param A0 Pointer to the first source operand (aka readonly destination).
4400; @param A1 Pointer to the second source operand.
4401; @param A2 Pointer to the EFLAGS register.
4402;
4403BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
4404 PROLOGUE_3_ARGS
4405 IEMIMPL_SSE_PROLOGUE
4406
4407 movdqu xmm0, [A0]
4408 movdqu xmm1, [A1]
4409 ptest xmm0, xmm1 ; only sets flags; registers untouched
4410 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; capture status flags into *A2
4411
4412 IEMIMPL_SSE_EPILOGUE
4413 EPILOGUE_3_ARGS
4414ENDPROC iemAImpl_ptest_u128
4415
; 256-bit VPTEST: same contract as ptest_u128 above (A0/A1 = sources,
; A2 = EFLAGS pointer).
; NOTE(review): uses the SSE prologue/epilogue macros although this is AVX
; code; harmless while both sets are empty.
4416BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
4417 PROLOGUE_3_ARGS
4418 IEMIMPL_SSE_PROLOGUE
4419
4420 vmovdqu ymm0, [A0]
4421 vmovdqu ymm1, [A1]
4422 vptest ymm0, ymm1 ; only sets flags; registers untouched
4423 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0
4424
4425 IEMIMPL_SSE_EPILOGUE
4426 EPILOGUE_3_ARGS
4427ENDPROC iemAImpl_vptest_u256
4428
4429
4430;;
4431; Template for the [v]pmov{s,z}x* instructions
4432;
4433; @param 1 The instruction
4434;
4435; @param A0 Pointer to the destination media register size operand (output).
4436; @param A1 The source operand value (input).
;           NOTE(review): for the _u256 variant A1 is dereferenced as a
;           pointer instead - confirm the intended contract.
; NOTE(review): "movd xmm0, A1" only transfers 32 bits unless the assembler
; promotes the 64-bit register form to movq; the *q variants (e.g. pmovsxdq)
; consume 64 source bits - confirm the generated encoding.
4437;
4438%macro IEMIMPL_V_PMOV_SZ_X 1
4439BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4440 PROLOGUE_2_ARGS
4441 IEMIMPL_SSE_PROLOGUE
4442
4443 movd xmm0, A1
4444 %1 xmm0, xmm0
4445 movdqu [A0], xmm0 ; fixed: was the VEX-encoded vmovdqu, which would #UD on SSE4.1 hosts without AVX
4446
4447 IEMIMPL_SSE_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_SSE_PROLOGUE (harmless while the macros are empty)
4448 EPILOGUE_2_ARGS
4449ENDPROC iemAImpl_ %+ %1 %+ _u128
4450
4451BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4452 PROLOGUE_2_ARGS
4453 IEMIMPL_AVX_PROLOGUE
4454
4455 movd xmm0, A1
4456 v %+ %1 xmm0, xmm0
4457 vmovdqu [A0], xmm0
4458
4459 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE
4460 EPILOGUE_2_ARGS
4461ENDPROC iemAImpl_v %+ %1 %+ _u128
4462
4463BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4464 PROLOGUE_2_ARGS
4465 IEMIMPL_AVX_PROLOGUE
4466
4467 movdqu xmm0, [A1]
4468 v %+ %1 ymm0, xmm0
4469 vmovdqu [A0], ymm0
4470
4471 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE
4472 EPILOGUE_2_ARGS
4473ENDPROC iemAImpl_v %+ %1 %+ _u256
4474%endmacro
4475
4476IEMIMPL_V_PMOV_SZ_X pmovsxbw
4477IEMIMPL_V_PMOV_SZ_X pmovsxbd
4478IEMIMPL_V_PMOV_SZ_X pmovsxbq
4479IEMIMPL_V_PMOV_SZ_X pmovsxwd
4480IEMIMPL_V_PMOV_SZ_X pmovsxwq
4481IEMIMPL_V_PMOV_SZ_X pmovsxdq
4482
4483IEMIMPL_V_PMOV_SZ_X pmovzxbw
4484IEMIMPL_V_PMOV_SZ_X pmovzxbd
4485IEMIMPL_V_PMOV_SZ_X pmovzxbq
4486IEMIMPL_V_PMOV_SZ_X pmovzxwd
4487IEMIMPL_V_PMOV_SZ_X pmovzxwq
4488IEMIMPL_V_PMOV_SZ_X pmovzxdq
4489
4490
4491;;
4492; Need to move this as well somewhere better?
4493;
; Result of an SSE operation: the 128-bit value followed by the MXCSR.
4494struc IEMSSERESULT
4495 .uResult resd 4 ; 128-bit result value
4496 .MXCSR resd 1 ; MXCSR after the operation
4497endstruc
4498
4499
4500;;
4501; Need to move this as well somewhere better?
4502;
; Result of a 128-bit AVX operation: the value followed by the MXCSR.
4503struc IEMAVX128RESULT
4504 .uResult resd 4 ; 128-bit result value
4505 .MXCSR resd 1 ; MXCSR after the operation
4506endstruc
4507
4508
4509;;
4510; Need to move this as well somewhere better?
4511;
; Result of a 256-bit AVX operation: the value followed by the MXCSR.
4512struc IEMAVX256RESULT
4513 .uResult resd 8 ; 256-bit result value
4514 .MXCSR resd 1 ; MXCSR after the operation
4515endstruc
4516
4517
4518;;
4519; Initialize the SSE MXCSR register using the guest value partially to
4520; account for rounding mode.
4521;
4522; @uses 4 bytes of stack to save the original value, T0.
4523; @param 1 Expression giving the address of the FXSTATE of the guest.
4524;
; The saved host MXCSR remains on the stack; SSE_ST_FXSTATE_MXCSR pops and
; restores it, so the two macros must be used as a pair.
4525%macro SSE_LD_FXSTATE_MXCSR 1
4526 sub xSP, 4
4527
4528 stmxcsr [xSP] ; stash the host MXCSR on the stack for later restore
4529 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4530 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only FZ/rounding/DAZ from the guest
4531 or T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions so the host never traps
4532 sub xSP, 4
4533 mov [xSP], T0_32
4534 ldmxcsr [xSP]
4535 add xSP, 4
4536%endmacro
4537
4538
4539;;
4540; Restores the SSE MXCSR register with the original value.
4541;
4542; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4543; @param 1 Expression giving the address where to return the MXCSR value.
4544; @param 2 Expression giving the address of the FXSTATE of the guest.
4545;
4546; @note Restores the stack pointer.
4547;
; Counterpart of SSE_LD_FXSTATE_MXCSR: merges the exception flags raised by
; the host operation into the guest MXCSR, stores that, then restores the
; host MXCSR stashed on the stack by the LD macro.
4548%macro SSE_ST_FXSTATE_MXCSR 2
4549 sub xSP, 4
4550 stmxcsr [xSP] ; read MXCSR as left by the operation
4551 mov T0_32, [xSP]
4552 add xSP, 4
4553 ; Merge the status bits into the original MXCSR value.
4554 mov T1_32, [%2 + X86FXSTATE.MXCSR]
4555 and T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the raised exception flags
4556 or T0_32, T1_32
4557 mov [%1 + IEMSSERESULT.MXCSR], T0_32
4558
4559 ldmxcsr [xSP] ; restore the host MXCSR saved by SSE_LD_FXSTATE_MXCSR
4560 add xSP, 4
4561%endmacro
4562
4563
4564;;
4565; Initialize the SSE MXCSR register using the guest value partially to
4566; account for rounding mode.
4567;
4568; @uses 4 bytes of stack to save the original value.
4569; @param 1 Expression giving the address of the FXSTATE of the guest.
4570;
; Leaves the saved host MXCSR on the stack for the AVX*_ST_XSAVEAREA_MXCSR
; macros to restore - the pair must be kept balanced.
; NOTE(review): unlike SSE_LD_FXSTATE_MXCSR, this does not OR in
; X86_MXCSR_XCPT_MASK, so exception mask bits come out cleared - confirm
; this cannot cause host #XM traps.
4571%macro AVX_LD_XSAVEAREA_MXCSR 1
4572 sub xSP, 4
4573
4574 stmxcsr [xSP] ; stash the host MXCSR
4575 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4576 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only FZ/rounding/DAZ from the guest
4577 sub xSP, 4
4578 mov [xSP], T0_32
4579 ldmxcsr [xSP]
4580 add xSP, 4
4581%endmacro
4582
4583
4584;;
4585; Restores the AVX128 MXCSR register with the original value.
4586;
4587; @param 1 Expression giving the address where to return the MXCSR value.
4588;
4589; @note Restores the stack pointer.
4590;
; Stores the post-operation MXCSR into the result struct, then reloads the
; host MXCSR stashed on the stack by AVX_LD_XSAVEAREA_MXCSR.
4591%macro AVX128_ST_XSAVEAREA_MXCSR 1
4592 stmxcsr [%1 + IEMAVX128RESULT.MXCSR]
4593
4594 ldmxcsr [xSP] ; restore host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
4595 add xSP, 4
4596%endmacro
4597
4598
4599;;
4600; Restores the AVX256 MXCSR register with the original value.
4601;
4602; @param 1 Expression giving the address where to return the MXCSR value.
4603;
4604; @note Restores the stack pointer.
4605;
; Same as the 128-bit variant but targets the IEMAVX256RESULT layout.
4606%macro AVX256_ST_XSAVEAREA_MXCSR 1
4607 stmxcsr [%1 + IEMAVX256RESULT.MXCSR]
4608
4609 ldmxcsr [xSP] ; restore host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
4610 add xSP, 4
4611%endmacro
4612
4613
4614;;
4615; Floating point instruction working on two full sized registers.
4616;
4617; @param 1 The instruction
4618; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4619;
4620; @param A0 FPU context (FXSTATE or XSAVEAREA).
4621; @param A1 Where to return the result including the MXCSR value.
4622; @param A2 Pointer to the first media register size operand (input/output).
4623; @param A3 Pointer to the second media register size operand (input).
4624;
; NOTE(review): the BEGINPROC_FASTCALL stack-arg byte count is 12 here while
; PROLOGUE_4_ARGS is used; other 4-argument helpers in this file use 16 -
; confirm (only affects 32-bit fastcall name decoration).
4625%macro IEMIMPL_FP_F2 2
4626BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4627 PROLOGUE_4_ARGS
4628 IEMIMPL_SSE_PROLOGUE
4629 SSE_LD_FXSTATE_MXCSR A0 ; load guest rounding/FZ/DAZ, stash host MXCSR
4630
4631 movdqu xmm0, [A2]
4632 movdqu xmm1, [A3]
4633 %1 xmm0, xmm1
4634 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4635
4636 SSE_ST_FXSTATE_MXCSR A1, A0 ; return merged MXCSR, restore host MXCSR
4637 IEMIMPL_SSE_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_SSE_PROLOGUE (harmless while the macros are empty)
4638 EPILOGUE_4_ARGS
4639ENDPROC iemAImpl_ %+ %1 %+ _u128
4640
4641 %if %2 == 3
4642BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4643 PROLOGUE_4_ARGS
4644 IEMIMPL_AVX_PROLOGUE
4645 AVX_LD_XSAVEAREA_MXCSR A0
4646
4647 vmovdqu xmm0, [A2]
4648 vmovdqu xmm1, [A3]
4649 v %+ %1 xmm0, xmm0, xmm1 ; three-operand AVX form
4650 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4651
4652 AVX128_ST_XSAVEAREA_MXCSR A1
4653 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE
4654 EPILOGUE_4_ARGS
4655ENDPROC iemAImpl_v %+ %1 %+ _u128
4656
4657BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4658 PROLOGUE_4_ARGS
4659 IEMIMPL_AVX_PROLOGUE
4660 AVX_LD_XSAVEAREA_MXCSR A0
4661
4662 vmovdqu ymm0, [A2]
4663 vmovdqu ymm1, [A3]
4664 v %+ %1 ymm0, ymm0, ymm1
4665 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4666
4667 AVX256_ST_XSAVEAREA_MXCSR A1
4668 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE
4669 EPILOGUE_4_ARGS
4670ENDPROC iemAImpl_v %+ %1 %+ _u256
4671 %elif %2 == 2
4672BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4673 PROLOGUE_4_ARGS
4674 IEMIMPL_AVX_PROLOGUE
4675 AVX_LD_XSAVEAREA_MXCSR A0
4676
4677 vmovdqu xmm0, [A2]
4678 vmovdqu xmm1, [A3]
4679 v %+ %1 xmm0, xmm1 ; two-operand AVX form
4680 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4681
4682 AVX128_ST_XSAVEAREA_MXCSR A1
4683 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE
4684 EPILOGUE_4_ARGS
4685ENDPROC iemAImpl_v %+ %1 %+ _u128
4686
4687BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4688 PROLOGUE_4_ARGS
4689 IEMIMPL_AVX_PROLOGUE
4690 AVX_LD_XSAVEAREA_MXCSR A0
4691
4692 vmovdqu ymm0, [A2]
4693 vmovdqu ymm1, [A3]
4694 v %+ %1 ymm0, ymm1
4695 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4696
4697 AVX256_ST_XSAVEAREA_MXCSR A1
4698 IEMIMPL_AVX_EPILOGUE ; fixed: was a copy-pasted IEMIMPL_AVX_PROLOGUE
4699 EPILOGUE_4_ARGS
4700ENDPROC iemAImpl_v %+ %1 %+ _u256
4701 %endif
4702%endmacro
4703
4704IEMIMPL_FP_F2 addps, 3
4705IEMIMPL_FP_F2 addpd, 3
4706IEMIMPL_FP_F2 mulps, 3
4707IEMIMPL_FP_F2 mulpd, 3
4708IEMIMPL_FP_F2 subps, 3
4709IEMIMPL_FP_F2 subpd, 3
4710IEMIMPL_FP_F2 minps, 3
4711IEMIMPL_FP_F2 minpd, 3
4712IEMIMPL_FP_F2 divps, 3
4713IEMIMPL_FP_F2 divpd, 3
4714IEMIMPL_FP_F2 maxps, 3
4715IEMIMPL_FP_F2 maxpd, 3
4716IEMIMPL_FP_F2 haddps, 3
4717IEMIMPL_FP_F2 haddpd, 3
4718IEMIMPL_FP_F2 hsubps, 3
4719IEMIMPL_FP_F2 hsubpd, 3
4720IEMIMPL_FP_F2 addsubps, 3
4721IEMIMPL_FP_F2 addsubpd, 3
4722
4723
4724;;
4725; These are actually unary operations but to keep it simple
4726; we treat them as binary for now, so the output result is
4727; always in sync with the register where the result might get written
4728; to.
4729IEMIMPL_FP_F2 sqrtps, 2
4730IEMIMPL_FP_F2 rsqrtps, 2
4731IEMIMPL_FP_F2 sqrtpd, 2
4732IEMIMPL_FP_F2 cvtdq2ps, 2
4733IEMIMPL_FP_F2 cvtps2dq, 2
4734IEMIMPL_FP_F2 cvttps2dq, 2
4735IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4736IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
4737IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4738
4739
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; @param 1 The instruction
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]              ; Only the 32-bit single precision source value is read.
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1        ; Three operand AVX form: dst, src1, src2.
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE - must undo the prologue (cf. IEMIMPL_FP_F2_R64).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
4791
4792
;;
; Floating point instruction working on a full sized register and a double precision operand.
;
; @param 1 The instruction
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movq    xmm1, [A3]              ; Only the 64-bit double precision source value is read.
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovq   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1        ; Three operand AVX form: dst, src1, src2.
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4843
4844
;;
; Macro for the cvtpd2ps/cvtps2pd instructions.
;
; These conversions change the element width, so the 256-bit AVX variant
; pairs a YMM register on one side with an XMM register on the other,
; selected by parameter 2.
;
; 1 The instruction name.
; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1              ; Two operand AVX form.
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; Narrowing conversion: 256-bit source, 128-bit result (cvtpd2ps).
 %else
        v %+ %1 ymm0, xmm1              ; Widening conversion: 128-bit source, 256-bit result (cvtps2pd).
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
4909
4910
;;
; shufps instructions with 8-bit immediates.
;
; The immediate cannot be encoded at runtime, so a table of 256 stubs
; (.imm0 .. .imm255), one per immediate value, is generated below and the
; right one is selected by computing its offset from the immediate.
;
; @param A0 Pointer to the destination media register size operand (input/output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A2.
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps  xmm0, xmm1, bImm
        ret
        int3                            ; Padding so every stub has the same size.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_shufps_u128
4948
4949
;;
; shufpd instruction with 8-bit immediates.
;
; Same 256-entry stub-table scheme as iemAImpl_shufps_u128; shufpd has a
; one byte longer encoding than shufps, so no int3 padding is needed here.
;
; @param A0 Pointer to the destination media register size operand (input/output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A2.
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd  xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_shufpd_u128
4986
4987
;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Uses the same 256-entry immediate stub-table scheme as the SSE shufps
; implementation above, for both 128-bit and 256-bit operand sizes.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
; @param A3 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A3.
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A3.
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5062
5063
;;
; One of the [p]blendv{b,ps,pd} variants
;
; The mask is loaded into xmm0 because these SSE instructions use XMM0 as
; the implicit mask operand.
;
; @param 1 The instruction
;
; @param A0 Pointer to the first media register sized operand (input/output).
; @param A1 Pointer to the second media sized value (input).
; @param A2 Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2] ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1] ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; Fixed: was IEMIMPL_SSE_PROLOGUE - must undo the prologue on exit.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5092
5093
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; @param 1 The instruction
;
; @param A0 Pointer to the first media register sized operand (output).
; @param A1 Pointer to the first media register sized operand (input).
; @param A2 Pointer to the second media register sized operand (input).
; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]              ; The AVX form takes the mask as an explicit fourth operand.
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE - must undo the prologue on exit.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE - must undo the prologue on exit.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5136
5137
;;
; palignr mm1, mm2/m64 instruction.
;
; MMX variant; uses the 256-entry immediate stub-table scheme.
;
; @param A0 Pointer to the first media register sized operand (output).
; @param A1 The second register sized operand (input, passed by value).
; @param A2 The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, A1                 ; A1 holds the 64-bit value itself, not a pointer.
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A2.
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_palignr_u64
5173
5174
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Uses the 256-entry immediate stub-table scheme; each stub is padded with
; int3 to a uniform 8 bytes (12 with IBT endbrxx).
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
 %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; Padding so every stub has the same size.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5226
5227
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Uses the 256-entry immediate stub-table scheme; each stub is padded with
; int3 to a uniform 8 bytes (12 with IBT endbrxx).
;
; @param 1 The instruction name.
; @param 2 Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
; @param A3 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3                            ; Padding so every stub has the same size.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3                            ; Padding so every stub has the same size.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5312
5313
;;
; Need to move this as well somewhere better?
;
; Source operand layouts passed to the SSE4.2 string instruction helpers
; (pcmpistr* / pcmpestr*) below.
;
struc IEMPCMPISTRXSRC
    .uSrc1 resd 4                       ; First 128-bit source operand.
    .uSrc2 resd 4                       ; Second 128-bit source operand.
endstruc

struc IEMPCMPESTRXSRC
    .uSrc1 resd 4                       ; First 128-bit source operand.
    .uSrc2 resd 4                       ; Second 128-bit source operand.
    .u64Rax resd 2                      ; Explicit RAX length operand (64 bits as two dwords).
    .u64Rdx resd 2                      ; Explicit RDX length operand (64 bits as two dwords).
endstruc
5328
;;
; The pcmpistri instruction.
;
; Uses the 256-entry immediate stub-table scheme; the result index is
; produced by the instruction in ECX.
;
; @param A0 Pointer to the ECX register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; Store the index result via the saved A0 pointer.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistri xmm0, xmm1, bImm
        ret
        int3                            ; Padding so every stub has the same size.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_pcmpistri_u128
5370
;;
; The pcmpestri instruction.
;
; Like pcmpistri, but the explicit-length form also needs the guest RAX/RDX
; values loaded into the host RAX/RDX around the instruction.
;
; @param A0 Pointer to the ECX register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX                     ; Restore before IEM_SAVE_FLAGS/mov may need the argument registers again.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; Store the index result via the saved A0 pointer.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_pcmpestri_u128
5416
;;
; The pcmpistrm instruction template.
;
; Uses the 256-entry immediate stub-table scheme; the mask result is
; produced by the instruction in XMM0.
;
; @param A0 Pointer to the XMM0 register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(pcmpistrm+ret) == 8: A3 * 8
                                        ; Fixed: destination was T0, leaving T1 pointing at .imm0 so the
                                        ; immediate was ignored in the non-IBT build (cf. pcmpestrm below).
 %endif
        IBT_NOTRACK
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3                            ; Padding so every stub has the same size.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_pcmpistrm_u128
5457
;;
; The pcmpestrm instruction template.
;
; Explicit-length form: the guest RAX/RDX values are loaded into the host
; RAX/RDX around the instruction; the mask result lands in XMM0.
;
; @param A0 Pointer to the XMM0 register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_pcmpestrm_u128
5502
5503
;;
; pinsrw instruction.
;
; MMX variant; uses the 256-entry immediate stub-table scheme (stub size 5,
; or 9 with IBT endbrxx).
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 The 16 bit input operand (input, passed by value).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movq    mm0, [A0]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pinsrw+ret) == 5: A2 * 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = address of the stub for immediate A2.
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  mm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_pinsrw_u64
5539
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        ; SSE variant of pinsrw; same stub-table scheme as the u64 variant
        ; above, but the 66h-prefixed encoding makes each stub 6 bytes.
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A2.
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_pinsrw_u128
5568
;;
; vpinsrw instruction.
;
; Uses the 256-entry immediate stub-table scheme.
; NOTE(review): uses the SSE prologue/epilogue pair rather than the AVX one
; like the other v* helpers here - confirm this is intentional.
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 16 bit input operand (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A3.
        mov     A1, A2                  ; A2 requires longer encoding on Windows
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_vpinsrw_u128
5606
5607
;;
; pextrw instruction.
;
; MMX variant; uses the 256-entry immediate stub-table scheme. Each stub
; leaves the extracted word in T0_32 for the caller to store.
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 The media register size operand (input, passed by value).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movq    mm0, A1                 ; A1 holds the 64-bit value itself, not a pointer.
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pextrw+ret) == 5: A2 * 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = address of the stub for immediate A2.
        IBT_NOTRACK
        call    T1
        mov     word [A0], T0_16        ; Store the extracted word returned by the stub.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, mm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_pextrw_u64
5643
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        ; SSE variant of pextrw; same stub-table scheme as the u64 variant
        ; above, but the 66h-prefixed encoding makes each stub 6 bytes.
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A2.
        IBT_NOTRACK
        call    T1
        mov     word [A0], T0_16        ; Store the extracted word returned by the stub.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_pextrw_u128
5672
;;
; vpextrw instruction.
;
; Uses the 256-entry immediate stub-table scheme.
; NOTE(review): uses the SSE prologue/epilogue pair rather than the AVX one
; like the other v* helpers here - confirm this is intentional.
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A2.
        IBT_NOTRACK
        call    T1
        mov     word [A0], T0_16        ; Store the extracted word returned by the stub.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpextrw T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Build-time check of the assumed stub size.
ENDPROC iemAImpl_vpextrw_u128
5708
5709
;;
; movmskp{s,d} SSE instruction template
;
; Generates the SSE helper plus the AVX 128/256-bit variants. The mask is
; produced in a GPR by the instruction; only its low byte is stored.
;
; @param 1 The SSE instruction name.
; @param 2 The AVX instruction name.
;
; @param A0 Pointer to the output register (output/byte sized).
; @param A1 Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0                ; Mask lands in the GPR T0.
        mov     byte [A0], T0_8

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5759
5760
;;
; Restores the SSE MXCSR register with the original value.
;
; Merges the exception status flags accumulated in the live MXCSR into the
; guest MXCSR image and stores the combined value, then reloads the host
; MXCSR which the matching SSE_LD_FXSTATE_MXCSR invocation left saved on
; the stack (that is why this macro nets a +4 adjustment of xSP at the end).
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param 2 Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; fetch the current (guest config) MXCSR with fresh status bits
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; reload host MXCSR saved by SSE_LD_FXSTATE_MXCSR
        add     xSP, 4                  ; release the slot SSE_LD_FXSTATE_MXCSR left allocated
%endmacro
5784
5785
;;
; cvttsd2si instruction - 32-bit variant.
;
; Truncating conversion of a double precision value to a signed 32-bit
; integer, performed under the guest's MXCSR configuration.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvttsd2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
5806
;;
; cvttsd2si instruction - 64-bit variant.
;
; Truncating conversion of a double precision value to a signed 64-bit
; integer, performed under the guest's MXCSR configuration.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvttsd2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5827
5828
;;
; cvtsd2si instruction - 32-bit variant.
;
; Rounding conversion (per guest MXCSR rounding control) of a double
; precision value to a signed 32-bit integer.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvtsd2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
5849
;;
; cvtsd2si instruction - 64-bit variant.
;
; Rounding conversion (per guest MXCSR rounding control) of a double
; precision value to a signed 64-bit integer.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvtsd2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
5870
5871
;;
; cvttss2si instruction - 32-bit variant.
;
; Truncating conversion of a single precision value to a signed 32-bit
; integer, performed under the guest's MXCSR configuration.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvttss2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32
5892
;;
; cvttss2si instruction - 64-bit variant.
;
; Truncating conversion of a single precision value to a signed 64-bit
; integer, performed under the guest's MXCSR configuration.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvttss2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
5913
5914
;;
; cvtss2si instruction - 32-bit variant.
;
; Rounding conversion (per guest MXCSR rounding control) of a single
; precision value to a signed 32-bit integer.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvtss2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32
5935
;;
; cvtss2si instruction - 64-bit variant.
;
; Rounding conversion (per guest MXCSR rounding control) of a single
; precision value to a signed 64-bit integer.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvtss2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
5956
5957
;;
; cvtsi2ss instruction - 32-bit variant.
;
; Converts a signed 32-bit integer to a single precision value under the
; guest's MXCSR configuration; only the low dword of xmm0 is stored.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvtsi2ss xmm0, dword [A3]       ; merges into xmm0, but only the low dword is used below
        movd    dword [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
5978
;;
; cvtsi2ss instruction - 64-bit variant.
;
; Converts a signed 64-bit integer to a single precision value under the
; guest's MXCSR configuration; only the low dword of xmm0 is stored.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvtsi2ss xmm0, qword [A3]
        movd    dword [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
5999
6000
;;
; cvtsi2sd instruction - 32-bit variant.
;
; Converts a signed 32-bit integer to a double precision value under the
; guest's MXCSR configuration; only the low qword of xmm0 is stored.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvtsi2sd xmm0, dword [A3]
        movq    [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
6021
;;
; cvtsi2sd instruction - 64-bit variant.
;
; Converts a signed 64-bit integer to a double precision value under the
; guest's MXCSR configuration; only the low qword of xmm0 is stored.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (host value saved on stack)

        cvtsi2sd xmm0, qword [A3]
        movq    [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
6042
6043
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Saves the host MXCSR on the stack, then loads a MXCSR value combining the
; guest's FZ/DAZ/rounding-control bits with all exception mask bits set, so
; the host never takes a SIMD exception on the guest's behalf.
;
; @uses 8 bytes of stack (4 transient; 4 stay allocated holding the saved
;       host MXCSR until the paired SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE),
;       T0.
; @param 1 Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save host MXCSR; slot released by the ST macro
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions while emulating
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
6063
6064
;;
; Restores the SSE MXCSR register with the original value.
;
; Merges the freshly accumulated exception status flags into the guest MXCSR
; value at %1, then reloads the host MXCSR which the paired
; SSE_LD_FXSTATE_MXCSR_ONLY invocation left saved on the stack (hence the
; net +4 stack adjustment at the end).
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        sub     xSP, 4
        stmxcsr [xSP]                   ; fetch current MXCSR with fresh status bits
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%1]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; reload host MXCSR saved by SSE_LD_FXSTATE_MXCSR_ONLY
        add     xSP, 4                  ; release the slot the LD macro left allocated
%endmacro
6087
6088
;
; UCOMISS (SSE)
;
; Unordered compare of the low single precision values; the status EFLAGS
; produced by the instruction are captured via IEM_SAVE_FLAGS.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture ZF/PF/CF (and cleared OF/AF/SF)

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
6111
;
; VUCOMISS (AVX) - same layout as the SSE worker above.
; NOTE(review): uses the SSE prologue/epilogue rather than the AVX ones,
; matching the other v*comis* workers here - confirm this is intentional.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
6126
6127
;
; UCOMISD (SSE)
;
; Unordered compare of the low double precision values; the status EFLAGS
; produced by the instruction are captured via IEM_SAVE_FLAGS.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
6150
;
; VUCOMISD (AVX) - same layout as the SSE worker above.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6165
;
; COMISS (SSE)
;
; Ordered compare of the low single precision values; the status EFLAGS
; produced by the instruction are captured via IEM_SAVE_FLAGS.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comiss  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
6188
;
; VCOMISS (AVX) - same layout as the SSE worker above.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6203
6204
;
; COMISD (SSE)
;
; Ordered compare of the low double precision values; the status EFLAGS
; produced by the instruction are captured via IEM_SAVE_FLAGS.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comisd  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
6227
;
; VCOMISD (AVX) - same layout as the SSE worker above.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6242
6243
;;
; Layout for passing two XMM-register sized source operands to a media
; worker via a single pointer (two consecutive 16-byte values).
;
; @todo Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1 resd 4                       ; first 128-bit source operand
    .uSrc2 resd 4                       ; second 128-bit source operand
endstruc
6251
6252
;
; CMPPS (SSE)
;
; Packed single precision compare with an immediate predicate, dispatched
; through a table of 256 pre-generated stubs (one per immediate value).
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*8]         ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
 %else
        lea     T0, [A3 + A3*4]         ; sizeof(cmpps+ret) == 5: A3 * 5
 %endif
        lea     T1, [T1 + T0]
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0              ; store the compare mask result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        cmpps   xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; build-time check of the stub-size arithmetic above
ENDPROC iemAImpl_cmpps_u128
6292
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; The immediate is handled by dispatching into a table of 256 pre-generated
; stubs, one per possible imm8 value (6 bytes each: 5-byte insn + ret).
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0              ; store the result operand

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time check of the stub-size arithmetic above
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6343
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; The immediate is handled by dispatching into a table of 256 pre-generated
; stubs (8 bytes each: 6-byte insn + ret + int3 padding).
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insn+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0              ; store the result operand

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; padding to keep stubs uniformly sized
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; build-time check of the stub-size arithmetic above
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6398
6399
;;
; SSE instructions of the form
;    xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; Converts from an XMM source to a 64-bit MMX destination under the guest
; MXCSR configuration (e.g. cvtpd2pi/cvttpd2pi).
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first MMX register sized operand (output).
; @param A2 Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A2]
        %1      mm0, xmm0
        movq    [A1], mm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6429
;;
; SSE instructions of the form
;    xxx xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Converts a 64-bit MMX-register value into the XMM destination under the
; guest MXCSR configuration (e.g. cvtpi2ps/cvtpi2pd); the destination is
; read-modified-written since the instructions merge into it.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register sized operand (input/output).
; @param A2 The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movdqu  xmm0, [A1]              ; load destination (partially preserved by the insn)
        movq    mm0, A2
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6460
;;
; SSE instructions of the form
;    xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Converts a 64-bit source value to a 64-bit MMX result under the guest
; MXCSR configuration (e.g. cvtps2pi/cvttps2pi).
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first MMX media register sized operand (output).
; @param A2 The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; apply guest rounding/DAZ/FZ, mask exceptions

        movq    xmm0, A2
        %1      mm0, xmm0
        movq    [A1], mm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6490
;
; All forms of RDRAND and RDSEED
;
; Executes the instruction on the host and stores the value plus the
; resulting EFLAGS (CF signals success on real hardware).
;
; @param 1  The instruction name (rdrand/rdseed).
; @param 2  The register to use (ax/eax/rax).
; @param 3  The operand width in bits (16/32/64).
;
; @param A0 Pointer to the destination operand.
; @param A1 Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2
        mov     [A0], %2
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture CF (and the zeroed flags)

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6515
6516
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The 2-bit round-function selector immediate is handled by dispatching into
; a table of 256 pre-generated stubs, one per imm8 encoding.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0              ; write back the updated state

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time check of the stub-size arithmetic above
ENDPROC iemAImpl_sha1rnds4_u128
6555
6556
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; Performs two SHA-256 rounds; the instruction implicitly reads XMM0, which
; is loaded from the constants supplied via A2.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; implicit operand must live in xmm0
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]
        sha256rnds2 xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
6579
6580
;
; 32-bit forms of ADCX and ADOX
;
; Loads the requested carry flag (CF or OF) from the guest EFLAGS, performs
; the extended add into the destination, and writes the updated flag back.
;
; @param 1  The instruction name (adcx/adox).
; @param 2  The EFLAGS bit the instruction consumes/produces (CF or OF).
;
; @param A0 Pointer to the destination operand (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS                 ; NOTE(review): 4-arg frame for a 3-arg worker - confirm intentional

        IEM_LOAD_FLAGS A1, %2, 0        ; seed the carry/overflow flag from guest EFLAGS
        %1      A2_32, [A0]             ; A2 += [A0] + flag
        mov     [A0], A2_32
        IEM_SAVE_FLAGS A1, %2, 0        ; store only the flag this instruction defines

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro
6600
;
; 64-bit forms of ADCX and ADOX
;
; Loads the requested carry flag (CF or OF) from the guest EFLAGS, performs
; the extended add into the destination, and writes the updated flag back.
;
; @param 1  The instruction name (adcx/adox).
; @param 2  The EFLAGS bit the instruction consumes/produces (CF or OF).
;
; @param A0 Pointer to the destination operand (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS                 ; NOTE(review): 4-arg frame for a 3-arg worker - confirm intentional

        IEM_LOAD_FLAGS A1, %2, 0        ; seed the carry/overflow flag from guest EFLAGS
        %1      A2, [A0]                ; A2 += [A0] + flag
        mov     [A0], A2
        IEM_SAVE_FLAGS A1, %2, 0        ; store only the flag this instruction defines

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette