VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 96109

Last change on this file since 96109 was 96109, checked in by vboxsync, 2 years ago

VMM/IEM: Implement [v]unpck{l,h}p{s,d} instructions, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 123.0 KB
Line 
1; $Id: IEMAllAImpl.asm 96109 2022-08-08 11:41:33Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2022 Oracle Corporation
8;
9; This file is part of VirtualBox Open Source Edition (OSE), as
10; available from http://www.virtualbox.org. This file is free software;
11; you can redistribute it and/or modify it under the terms of the GNU
12; General Public License (GPL) as published by the Free Software
13; Foundation, in version 2 as it comes in the "COPYING" file of the
14; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16;
17
18
19;*********************************************************************************************************************************
20;* Header Files *
21;*********************************************************************************************************************************
22%include "VBox/asmdefs.mac"
23%include "VBox/err.mac"
24%include "iprt/x86.mac"
25
26
27;*********************************************************************************************************************************
28;* Defined Constants And Macros *
29;*********************************************************************************************************************************
30
31;;
32; RET XX / RET wrapper for fastcall.
33;
; Emits 'ret %1' only when both RT_ARCH_X86 and RT_OS_WINDOWS are defined;
; every other configuration gets a plain 'ret'.
;
; @param 1 The byte count handed to 'ret' in the 32-bit Windows case.
;
34%macro RET_FASTCALL 1
35%ifdef RT_ARCH_X86
36 %ifdef RT_OS_WINDOWS
37 ret %1 ; return and release %1 bytes of stack.
38 %else
39 ret
40 %endif
41%else
42 ret
43%endif
44%endmacro
45
46;;
47; NAME for fastcall functions.
48;
49;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
50; escaping (or whatever the dollar is good for here). Thus the ugly
51; prefix argument.
52;
; Default definition: expands to NAME(a_Name); a_cbArgs and a_Prefix are ignored.
53%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
54%ifdef RT_ARCH_X86
55 %ifdef RT_OS_WINDOWS
; 32-bit Windows: redefine so the symbol becomes <a_Prefix><a_Name>@<a_cbArgs>.
56 %undef NAME_FASTCALL
57 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
58 %endif
59%endif
60
61;;
62; BEGINPROC for fastcall functions.
63;
64; @param 1 The function name (C).
65; @param 2 The argument size on x86.
66;
; Emits the optional export/global directives for the fastcall symbol and then
; the label itself. Note that the directives pass '$@' as the prefix argument
; while the label passes '@' (see the @todo on NAME_FASTCALL).
67%macro BEGINPROC_FASTCALL 2
68 %ifdef ASM_FORMAT_PE
69 export %1=NAME_FASTCALL(%1,%2,$@)
70 %endif
71 %ifdef __NASM__
72 %ifdef ASM_FORMAT_OMF
73 export NAME(%1) NAME_FASTCALL(%1,%2,$@)
74 %endif
75 %endif
76 %ifndef ASM_FORMAT_BIN
77 global NAME_FASTCALL(%1,%2,$@)
78 %endif
79NAME_FASTCALL(%1,%2,@):
80%endmacro
81
82
83;
84; We employ some macro assembly here to hide the calling convention differences.
85;
86%ifdef RT_ARCH_AMD64
87 %macro PROLOGUE_1_ARGS 0
88 %endmacro
89 %macro EPILOGUE_1_ARGS 0
90 ret
91 %endmacro
92 %macro EPILOGUE_1_ARGS_EX 0-1 ; optional arg: the x86 stack byte count, accepted and ignored here.
93 ret
94 %endmacro
95
96 %macro PROLOGUE_2_ARGS 0
97 %endmacro
98 %macro EPILOGUE_2_ARGS 0
99 ret
100 %endmacro
101 %macro EPILOGUE_2_ARGS_EX 1
102 ret
103 %endmacro
104
105 %macro PROLOGUE_3_ARGS 0
106 %endmacro
107 %macro EPILOGUE_3_ARGS 0
108 ret
109 %endmacro
110 %macro EPILOGUE_3_ARGS_EX 1
111 ret
112 %endmacro
113
114 %macro PROLOGUE_4_ARGS 0
115 %endmacro
116 %macro EPILOGUE_4_ARGS 0
117 ret
118 %endmacro
119 %macro EPILOGUE_4_ARGS_EX 1
120 ret
121 %endmacro
122
123 %ifdef ASM_CALL64_GCC
124 %define A0 rdi
125 %define A0_32 edi
126 %define A0_16 di
127 %define A0_8 dil
128
129 %define A1 rsi
130 %define A1_32 esi
131 %define A1_16 si
132 %define A1_8 sil
133
134 %define A2 rdx
135 %define A2_32 edx
136 %define A2_16 dx
137 %define A2_8 dl
138
139 %define A3 rcx
140 %define A3_32 ecx
141 %define A3_16 cx
142 %endif
143
144 %ifdef ASM_CALL64_MSC
145 %define A0 rcx
146 %define A0_32 ecx
147 %define A0_16 cx
148 %define A0_8 cl
149
150 %define A1 rdx
151 %define A1_32 edx
152 %define A1_16 dx
153 %define A1_8 dl
154
155 %define A2 r8
156 %define A2_32 r8d
157 %define A2_16 r8w
158 %define A2_8 r8b
159
160 %define A3 r9
161 %define A3_32 r9d
162 %define A3_16 r9w
163 %endif
164
165 %define T0 rax
166 %define T0_32 eax
167 %define T0_16 ax
168 %define T0_8 al
169
170 %define T1 r11
171 %define T1_32 r11d
172 %define T1_16 r11w
173 %define T1_8 r11b
174
175 %define T2 r10 ; only AMD64
176 %define T2_32 r10d
177 %define T2_16 r10w
178 %define T2_8 r10b
179
180%else
181 ; x86
182 %macro PROLOGUE_1_ARGS 0
183 push edi
184 %endmacro
185 %macro EPILOGUE_1_ARGS 0
186 pop edi
187 ret 0
188 %endmacro
189 %macro EPILOGUE_1_ARGS_EX 1
190 pop edi
191 ret %1
192 %endmacro
193
194 %macro PROLOGUE_2_ARGS 0
195 push edi
196 %endmacro
197 %macro EPILOGUE_2_ARGS 0
198 pop edi
199 ret 0
200 %endmacro
201 %macro EPILOGUE_2_ARGS_EX 1
202 pop edi
203 ret %1
204 %endmacro
205
; x86 three-argument prologue/epilogue. ebx is A2 and edi is T1 in this
; configuration, so both are saved here and restored by the epilogue.
206 %macro PROLOGUE_3_ARGS 0
207 push ebx
208 mov ebx, [esp + 4 + 4] ; skip the saved ebx plus 4 more bytes (presumably the return address) to load A2.
209 push edi
210 %endmacro
; @param 1 Byte count given to 'ret'; must be at least 4.
211 %macro EPILOGUE_3_ARGS_EX 1
212 %if (%1) < 4
213 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
214 %endif
215 pop edi ; restore in reverse order of the prologue pushes.
216 pop ebx
217 ret %1
218 %endmacro
; Default three-argument epilogue: returns with 'ret 4'.
219 %macro EPILOGUE_3_ARGS 0
220 EPILOGUE_3_ARGS_EX 4
221 %endmacro
222
; x86 four-argument prologue/epilogue. ebx is A2, esi is A3 and edi is T1 in
; this configuration; all three are saved here and restored by the epilogue.
223 %macro PROLOGUE_4_ARGS 0
224 push ebx
225 push edi
226 push esi
227 mov ebx, [esp + 12 + 4 + 0] ; 12 = the three pushes above; +4 presumably skips the return address. Loads A2.
228 mov esi, [esp + 12 + 4 + 4] ; the next stack slot, loads A3.
229 %endmacro
; @param 1 Byte count given to 'ret'; must be at least 8.
230 %macro EPILOGUE_4_ARGS_EX 1
231 %if (%1) < 8
232 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
233 %endif
234 pop esi ; restore in reverse order of the prologue pushes.
235 pop edi
236 pop ebx
237 ret %1
238 %endmacro
; Default four-argument epilogue: returns with 'ret 8'.
239 %macro EPILOGUE_4_ARGS 0
240 EPILOGUE_4_ARGS_EX 8
241 %endmacro
242
243 %define A0 ecx
244 %define A0_32 ecx
245 %define A0_16 cx
246 %define A0_8 cl
247
248 %define A1 edx
249 %define A1_32 edx
250 %define A1_16 dx
251 %define A1_8 dl
252
253 %define A2 ebx
254 %define A2_32 ebx
255 %define A2_16 bx
256 %define A2_8 bl
257
258 %define A3 esi
259 %define A3_32 esi
260 %define A3_16 si
261
262 %define T0 eax
263 %define T0_32 eax
264 %define T0_16 ax
265 %define T0_8 al
266
267 %define T1 edi
268 %define T1_32 edi
269 %define T1_16 di
270%endif
271
272
273;;
274; Load the modified and undefined flags (%2 | %3) from [%1] into EFLAGS.
275;
276; @remarks Clobbers T0, stack. Changes EFLAGS.
277; @remarks The '(%3) != 0' check below is commented out, so the load is unconditional.
278; @param 1 The parameter (A0..A3) pointing to the eflags.
279; @param 2 The set of modified flags.
280; @param 3 The set of undefined flags.
281;
282%macro IEM_MAYBE_LOAD_FLAGS 3
283 ;%if (%3) != 0
284 pushf ; store current flags
285 mov T0_32, [%1] ; load the guest flags
286 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
287 and T0_32, (%2 | %3) ; select the modified and undefined flags.
288 or [xSP], T0 ; merge guest flags with host flags (full-width T0, written by the 32-bit load above).
289 popf ; load the mixed flags.
290 ;%endif
291%endmacro
292
293;;
294; Update the flag.
295;
296; @remarks Clobbers T0, T1, stack.
297; @param 1 The register pointing to the EFLAGS.
298; @param 2 The mask of modified flags to save.
299; @param 3 The mask of undefined flags to (maybe) save.
300;
; Copies the (%2 | %3) bits of the current EFLAGS into the dword at [%1],
; leaving the other bits stored there untouched. Expands to nothing when the
; combined mask is zero.
301%macro IEM_SAVE_FLAGS 3
302 %if (%2 | %3) != 0
303 pushf ; current EFLAGS ...
304 pop T1 ; ... into T1.
305 mov T0_32, [%1] ; flags
306 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
307 and T1_32, (%2 | %3) ; select the modified and undefined flags.
308 or T0_32, T1_32 ; combine the flags.
309 mov [%1], T0_32 ; save the flags.
310 %endif
311%endmacro
312
313;;
314; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
315;
316; @remarks Clobbers T0, T1, stack.
317; @param 1 The register pointing to the EFLAGS.
318; @param 2 The mask of modified flags to save.
319; @param 3 Mask of additional flags to always clear
320; @param 4 Mask of additional flags to always set.
321;
; Stored flags = (old & ~(%2 | %3)) | (EFLAGS & %2) | %4.
; Expands to nothing when all three masks are zero.
322%macro IEM_SAVE_AND_ADJUST_FLAGS 4
323 %if (%2 | %3 | %4) != 0
324 pushf ; current EFLAGS ...
325 pop T1 ; ... into T1.
326 mov T0_32, [%1] ; load flags.
327 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
328 and T1_32, (%2) ; select the modified flags.
329 or T0_32, T1_32 ; combine the flags.
330 %if (%4) != 0
331 or T0_32, %4 ; add the always set flags.
332 %endif
333 mov [%1], T0_32 ; save the result.
334 %endif
335%endmacro
336
337;;
338; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
339; signed input (%4[%5]) and parity index (%6).
340;
341; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
342; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
343; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
344;
345; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
346; @param 1 The register pointing to the EFLAGS.
347; @param 2 The mask of modified flags to save.
348; @param 3 Mask of additional flags to always clear
349; @param 4 The result register to set SF by.
350; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
351; @param 6 The (full) register containing the parity table index. Will be modified!
352
; The new flag value is assembled in T1. The EFLAGS snapshot goes into T2 on
; AMD64; on x86 it goes into T0, which is saved and restored around that use.
; On AMD64 T2 is overwritten again by the parity table lookup.
353%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
354 %ifdef RT_ARCH_AMD64
355 pushf
356 pop T2 ; T2 = current EFLAGS.
357 %else
358 push T0 ; preserve T0 (popped again below).
359 pushf
360 pop T0 ; T0 = current EFLAGS.
361 %endif
362 mov T1_32, [%1] ; load flags.
363 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
364 %ifdef RT_ARCH_AMD64
365 and T2_32, (%2) ; select the modified flags.
366 or T1_32, T2_32 ; combine the flags.
367 %else
368 and T0_32, (%2) ; select the modified flags.
369 or T1_32, T0_32 ; combine the flags.
370 pop T0
371 %endif
372
373 ; First calculate SF as it's likely to be referring to the same register as %6 does.
374 bt %4, %5 - 1 ; CF = top bit of the %5-bit wide result.
375 jnc %%sf_clear
376 or T1_32, X86_EFL_SF
377 %%sf_clear:
378
379 ; Parity last.
380 and %6, 0xff ; index the table with the low byte only.
; NOTE(review): no T1_8 define is visible for the x86 register set in this file - confirm the x86 path assembles.
381 %ifdef RT_ARCH_AMD64
382 lea T2, [NAME(g_afParity) xWrtRIP]
383 or T1_8, [T2 + %6]
384 %else
385 or T1_8, [NAME(g_afParity) + %6]
386 %endif
387
388 mov [%1], T1_32 ; save the result.
389%endmacro
390
391;;
392; Calculates the new EFLAGS using fixed clear and set bit masks.
393;
394; @remarks Clobbers T0.
395; @param 1 The register pointing to the EFLAGS.
396; @param 2 Mask of additional flags to always clear
397; @param 3 Mask of additional flags to always set.
398;
; Stored flags = (old & ~%2) | %3. The current EFLAGS are not consulted.
; Expands to nothing when both masks are zero.
399%macro IEM_ADJUST_FLAGS 3
400 %if (%2 | %3) != 0
401 mov T0_32, [%1] ; Load flags.
402 %if (%2) != 0
403 and T0_32, ~(%2) ; Remove the always cleared flags.
404 %endif
405 %if (%3) != 0
406 or T0_32, %3 ; Add the always set flags.
407 %endif
408 mov [%1], T0_32 ; Save the result.
409 %endif
410%endmacro
411
412;;
413; Calculates the new EFLAGS using fixed clear and set bit masks, with PF
414; taken from the g_afParity table entry for the low byte of %4.
415; @remarks Clobbers T0, %4, EFLAGS, and on AMD64 also T2.
416; @param 1 The register pointing to the EFLAGS.
417; @param 2 Mask of additional flags to always clear
418; @param 3 Mask of additional flags to always set.
419; @param 4 The (full) register containing the parity table index. Will be modified!
420;
421%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
422 mov T0_32, [%1] ; Load flags.
423 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
424 %if (%3) != 0
425 or T0_32, %3 ; Add the always set flags.
426 %endif
427 and %4, 0xff ; Index the table with the low byte only.
428 %ifdef RT_ARCH_AMD64
429 lea T2, [NAME(g_afParity) xWrtRIP]
430 or T0_8, [T2 + %4]
431 %else
432 or T0_8, [NAME(g_afParity) + %4]
433 %endif
434 mov [%1], T0_32 ; Save the result.
435%endmacro
436
437
438;*********************************************************************************************************************************
439;* External Symbols *
440;*********************************************************************************************************************************
441extern NAME(g_afParity)
442
443
444;;
445; Macro for implementing a binary operator.
446;
447; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
448; variants, except on 32-bit system where the 64-bit accesses requires hand
449; coding.
450;
451; All the functions takes a pointer to the destination memory operand in A0,
452; the source register operand in A1 and a pointer to eflags in A2.
453;
454; @param 1 The instruction mnemonic.
455; @param 2 Non-zero if there should be a locked version.
456; @param 3 The modified flags.
457; @param 4 The undefined flags.
458;
; Every generated function has the same shape: load the guest flags, execute
; the instruction on [A0] with A1 as source, then save the resulting flags.
459%macro IEMIMPL_BIN_OP 4
460BEGINCODE
461BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
462 PROLOGUE_3_ARGS
463 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
464 %1 byte [A0], A1_8
465 IEM_SAVE_FLAGS A2, %3, %4
466 EPILOGUE_3_ARGS
467ENDPROC iemAImpl_ %+ %1 %+ _u8
468
469BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
470 PROLOGUE_3_ARGS
471 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
472 %1 word [A0], A1_16
473 IEM_SAVE_FLAGS A2, %3, %4
474 EPILOGUE_3_ARGS
475ENDPROC iemAImpl_ %+ %1 %+ _u16
476
477BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
478 PROLOGUE_3_ARGS
479 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
480 %1 dword [A0], A1_32
481 IEM_SAVE_FLAGS A2, %3, %4
482 EPILOGUE_3_ARGS
483ENDPROC iemAImpl_ %+ %1 %+ _u32
484
; The 64-bit variants are only generated for AMD64.
485 %ifdef RT_ARCH_AMD64
486BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
487 PROLOGUE_3_ARGS
488 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
489 %1 qword [A0], A1
490 IEM_SAVE_FLAGS A2, %3, %4
491 EPILOGUE_3_ARGS_EX 8
492ENDPROC iemAImpl_ %+ %1 %+ _u64
493 %endif ; RT_ARCH_AMD64
494
495 %if %2 != 0 ; locked versions requested?
496
; Same bodies as above, with a 'lock' prefix on the instruction.
497BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
498 PROLOGUE_3_ARGS
499 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
500 lock %1 byte [A0], A1_8
501 IEM_SAVE_FLAGS A2, %3, %4
502 EPILOGUE_3_ARGS
503ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
504
505BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
506 PROLOGUE_3_ARGS
507 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
508 lock %1 word [A0], A1_16
509 IEM_SAVE_FLAGS A2, %3, %4
510 EPILOGUE_3_ARGS
511ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
512
513BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
514 PROLOGUE_3_ARGS
515 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
516 lock %1 dword [A0], A1_32
517 IEM_SAVE_FLAGS A2, %3, %4
518 EPILOGUE_3_ARGS
519ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
520
521 %ifdef RT_ARCH_AMD64
522BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
523 PROLOGUE_3_ARGS
524 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
525 lock %1 qword [A0], A1
526 IEM_SAVE_FLAGS A2, %3, %4
527 EPILOGUE_3_ARGS_EX 8
528ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
529 %endif ; RT_ARCH_AMD64
530 %endif ; locked
531%endmacro
532
533; instr,lock, modified-flags, undefined flags
534IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
535IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
536IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
537IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
538IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
539IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
540IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
541IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
542IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
543
544
545;;
546; Macro for implementing a binary operator, VEX variant with separate input/output.
547;
548; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
549; where the 64-bit accesses requires hand coding.
550;
551; All the functions takes a pointer to the destination memory operand in A0,
552; the first source register operand in A1, the second source register operand
553; in A2 and a pointer to eflags in A3.
554;
555; @param 1 The instruction mnemonic.
556; @param 2 The modified flags.
557; @param 3 The undefined flags.
558;
; Each function loads the guest flags via A3, executes the three-operand
; instruction into T0, stores T0 to [A0] and saves the flags back via A3.
559%macro IEMIMPL_VEX_BIN_OP 3
560BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
561 PROLOGUE_4_ARGS
562 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
563 %1 T0_32, A1_32, A2_32
564 mov [A0], T0_32 ; mov does not alter EFLAGS, so they are still intact for the save below.
565 IEM_SAVE_FLAGS A3, %2, %3
566 EPILOGUE_4_ARGS
567ENDPROC iemAImpl_ %+ %1 %+ _u32
568
569 %ifdef RT_ARCH_AMD64
570BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
571 PROLOGUE_4_ARGS
572 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
573 %1 T0, A1, A2
574 mov [A0], T0
575 IEM_SAVE_FLAGS A3, %2, %3
576 EPILOGUE_4_ARGS
577ENDPROC iemAImpl_ %+ %1 %+ _u64
578 %endif ; RT_ARCH_AMD64
579%endmacro
580
581; instr, modified-flags, undefined-flags
582IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
583IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
584IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
585
586;;
587; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
588;
589; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
590; where the 64-bit accesses requires hand coding.
591;
592; All the functions takes a pointer to the destination memory operand in A0,
593; the source register operand in A1 and a pointer to eflags in A2.
594;
595; @param 1 The instruction mnemonic.
596; @param 2 The modified flags.
597; @param 3 The undefined flags.
598;
; These functions take three arguments (A0 = destination pointer, A1 = source,
; A2 = eflags pointer), so the three-argument prologue/epilogue is used. The
; four-argument pair would, on 32-bit x86, load a non-existent fourth argument
; and return with 'ret 8' instead of 'ret 4'. On AMD64 both pairs expand to
; the same code.
599%macro IEMIMPL_VEX_BIN_OP_2 3
600BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
601 PROLOGUE_3_ARGS
602 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
603 mov T0_32, [A0]
604 %1 T0_32, A1_32
605 mov [A0], T0_32 ; mov does not alter EFLAGS, so they are still intact for the save below.
606 IEM_SAVE_FLAGS A2, %2, %3
607 EPILOGUE_3_ARGS
608ENDPROC iemAImpl_ %+ %1 %+ _u32
609
610 %ifdef RT_ARCH_AMD64
611BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
612 PROLOGUE_3_ARGS
613 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
614 mov T0, [A0]
615 %1 T0, A1
616 mov [A0], T0
617 IEM_SAVE_FLAGS A2, %2, %3
618 EPILOGUE_3_ARGS
619ENDPROC iemAImpl_ %+ %1 %+ _u64
620 %endif ; RT_ARCH_AMD64
621%endmacro
622
623; instr, modified-flags, undefined-flags
624IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
625IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
626IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
627
628
629;;
630; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
631;
632; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
633; where the 64-bit accesses requires hand coding.
634;
635; All the functions take a pointer to the destination memory operand in A0,
636; the first source register operand in A1 and the second source register
637; operand in A2. EFLAGS are not touched, so there is no eflags pointer argument.
638;
639; @param 1 The instruction mnemonic.
640; @param 2 Fallback instruction if applicable.
641; @param 3 Whether to emit fallback or not.
642;
643%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
644BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
645 PROLOGUE_3_ARGS
646 %1 T0_32, A1_32, A2_32 ; T0 = A1 <op> A2 using the native three-operand instruction.
647 mov [A0], T0_32 ; store the 32-bit result.
648 EPILOGUE_3_ARGS
649ENDPROC iemAImpl_ %+ %1 %+ _u32
650
651 %if %3
; Fallback using the legacy instruction %2, which takes its count in CL.
652BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
653 PROLOGUE_3_ARGS
654 %ifdef ASM_CALL64_GCC
655 mov cl, A2_8 ; move the count (A2) into CL.
656 %2 A1_32, cl
657 mov [A0], A1_32
658 %else
659 xchg A2, A0 ; A0 is xCX here: afterwards CL = count and A2 = destination pointer.
660 %2 A1_32, cl
661 mov [A2], A1_32 ; store via the swapped destination pointer.
662 %endif
663 EPILOGUE_3_ARGS
664ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
665 %endif
666
667 %ifdef RT_ARCH_AMD64
668BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
669 PROLOGUE_3_ARGS
670 %1 T0, A1, A2 ; T0 = A1 <op> A2 using the native three-operand instruction.
671 mov [A0], T0 ; store the 64-bit result.
672 EPILOGUE_3_ARGS
673ENDPROC iemAImpl_ %+ %1 %+ _u64
674
675 %if %3
; 64-bit fallback using the legacy instruction %2, which takes its count in CL.
; The full 64-bit result is stored inside each calling convention branch.
676BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
677 PROLOGUE_3_ARGS
678 %ifdef ASM_CALL64_GCC
679 mov cl, A2_8 ; move the count (A2) into CL.
680 %2 A1, cl
681 mov [A0], A1 ; store the full 64-bit result.
682 %else
683 xchg A2, A0 ; A0 is rcx here: afterwards CL = count and A2 = destination pointer.
684 %2 A1, cl
685 mov [A2], A1 ; store the full 64-bit result via the swapped destination pointer.
686 %endif
687 ; No common store here: after the xchg above A0 holds the count, not the destination pointer.
688 EPILOGUE_3_ARGS
689ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
690 %endif
691 %endif ; RT_ARCH_AMD64
692%endmacro
693
694; instr, fallback instr, emit fallback
695IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
696IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
697IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
698IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
699IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
700
701
702;
703; RORX uses an immediate byte for the shift count, so we only do
704; a fallback implementation of that one (using ROR with the count in CL).
705;
706BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
707 PROLOGUE_3_ARGS
708 %ifdef ASM_CALL64_GCC
709 mov cl, A2_8 ; move the count (A2) into CL.
710 ror A1_32, cl
711 mov [A0], A1_32
712 %else
713 xchg A2, A0 ; A0 is xCX here: afterwards CL = count and A2 = destination pointer.
714 ror A1_32, cl
715 mov [A2], A1_32 ; store via the swapped destination pointer.
716 %endif
717 EPILOGUE_3_ARGS
718ENDPROC iemAImpl_rorx_u32
719
720 %ifdef RT_ARCH_AMD64
; 64-bit RORX fallback: rotates A1 right by the count in A2 and stores the
; full 64-bit result to the destination pointer passed in A0.
721BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
722 PROLOGUE_3_ARGS
723 %ifdef ASM_CALL64_GCC
724 mov cl, A2_8 ; move the count (A2) into CL.
725 ror A1, cl
726 mov [A0], A1 ; store the full 64-bit result.
727 %else
728 xchg A2, A0 ; A0 is rcx here: afterwards CL = count and A2 = destination pointer.
729 ror A1, cl
730 mov [A2], A1 ; store the full 64-bit result via the swapped destination pointer.
731 %endif
732 ; No common store here: after the xchg above A0 holds the count, not the destination pointer.
733 EPILOGUE_3_ARGS
734ENDPROC iemAImpl_rorx_u64
735 %endif ; RT_ARCH_AMD64
736
737
738;
739; MULX
740;
; Stores the high half of the product to [A0] and the low half to the pointer
; passed in A1. MULX takes one factor implicitly from EDX; the other is A3.
741BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
742 PROLOGUE_4_ARGS
743%ifdef ASM_CALL64_GCC
744 ; A2_32 is EDX - perfect
745 mulx T0_32, T1_32, A3_32 ; T0 = high half, T1 = low half.
746 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
747 mov [A0], T0_32
748%else
749 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
750 xchg A1, A2 ; afterwards A2 holds the low-half destination pointer.
751 mulx T0_32, T1_32, A3_32
752 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
753 mov [A0], T0_32
754%endif
755 EPILOGUE_4_ARGS
756ENDPROC iemAImpl_mulx_u32
757
758
; MULX fallback built on MUL (EDX:EAX = EAX * operand). Stores the high half
; of the product to [A0] and the low half to the pointer passed in A1.
759BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
760 PROLOGUE_4_ARGS
761%ifdef ASM_CALL64_GCC
762 ; A2_32 is EDX, T0_32 is EAX
763 mov eax, A3_32
764 mul A2_32
765 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
766 mov [A0], edx
767%else
768 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
769 xchg A1, A2 ; afterwards A1 (xDX) = uSrc1 and A2 = low-half destination pointer.
770 mov eax, A3_32
771 mul A1_32 ; multiply by uSrc1 (now in EDX), not by the pointer now held in A2.
772 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
773 mov [A0], edx
774%endif
775 EPILOGUE_4_ARGS
776ENDPROC iemAImpl_mulx_u32_fallback
777
778%ifdef RT_ARCH_AMD64
; 64-bit variant: stores the high half of the product to [A0] and the low half
; to the pointer passed in A1. MULX takes one factor implicitly from RDX.
779BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
780 PROLOGUE_4_ARGS
781%ifdef ASM_CALL64_GCC
782 ; A2 is RDX - perfect
783 mulx T0, T1, A3 ; T0 = high half, T1 = low half.
784 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
785 mov [A0], T0
786%else
787 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
788 xchg A1, A2 ; afterwards A2 holds the low-half destination pointer.
789 mulx T0, T1, A3
790 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
791 mov [A0], T0
792%endif
793 EPILOGUE_4_ARGS
794ENDPROC iemAImpl_mulx_u64
795
796
; 64-bit MULX fallback built on MUL (RDX:RAX = RAX * operand). Stores the high
; half of the product to [A0] and the low half to the pointer passed in A1.
797BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
798 PROLOGUE_4_ARGS
799%ifdef ASM_CALL64_GCC
800 ; A2 is RDX, T0 is RAX
801 mov rax, A3
802 mul A2
803 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
804 mov [A0], rdx
805%else
806 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
807 xchg A1, A2 ; afterwards A1 (RDX) = uSrc1 and A2 = low-half destination pointer.
808 mov rax, A3
809 mul A1 ; multiply by uSrc1 (now in RDX), not by the pointer now held in A2.
810 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
811 mov [A0], rdx
812%endif
813 EPILOGUE_4_ARGS
814ENDPROC iemAImpl_mulx_u64_fallback
815
816%endif
817
818
819;;
820; Macro for implementing a bit operator.
821;
822; This will generate code for the 16, 32 and 64 bit accesses with locked
823; variants, except on 32-bit system where the 64-bit accesses requires hand
824; coding.
825;
826; All the functions takes a pointer to the destination memory operand in A0,
827; the source register operand in A1 and a pointer to eflags in A2.
828;
829; @param 1 The instruction mnemonic.
830; @param 2 Non-zero if there should be a locked version.
831; @param 3 The modified flags.
832; @param 4 The undefined flags.
833;
; Every generated function has the same shape: load the guest flags, execute
; the bit instruction on [A0] with A1 as the second operand, save the flags.
; There is no 8-bit variant.
834%macro IEMIMPL_BIT_OP 4
835BEGINCODE
836BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
837 PROLOGUE_3_ARGS
838 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
839 %1 word [A0], A1_16
840 IEM_SAVE_FLAGS A2, %3, %4
841 EPILOGUE_3_ARGS
842ENDPROC iemAImpl_ %+ %1 %+ _u16
843
844BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
845 PROLOGUE_3_ARGS
846 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
847 %1 dword [A0], A1_32
848 IEM_SAVE_FLAGS A2, %3, %4
849 EPILOGUE_3_ARGS
850ENDPROC iemAImpl_ %+ %1 %+ _u32
851
; The 64-bit variants are only generated for AMD64.
852 %ifdef RT_ARCH_AMD64
853BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
854 PROLOGUE_3_ARGS
855 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
856 %1 qword [A0], A1
857 IEM_SAVE_FLAGS A2, %3, %4
858 EPILOGUE_3_ARGS_EX 8
859ENDPROC iemAImpl_ %+ %1 %+ _u64
860 %endif ; RT_ARCH_AMD64
861
862 %if %2 != 0 ; locked versions requested?
863
; Same bodies as above, with a 'lock' prefix on the instruction.
864BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
865 PROLOGUE_3_ARGS
866 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
867 lock %1 word [A0], A1_16
868 IEM_SAVE_FLAGS A2, %3, %4
869 EPILOGUE_3_ARGS
870ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
871
872BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
873 PROLOGUE_3_ARGS
874 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
875 lock %1 dword [A0], A1_32
876 IEM_SAVE_FLAGS A2, %3, %4
877 EPILOGUE_3_ARGS
878ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
879
880 %ifdef RT_ARCH_AMD64
881BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
882 PROLOGUE_3_ARGS
883 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
884 lock %1 qword [A0], A1
885 IEM_SAVE_FLAGS A2, %3, %4
886 EPILOGUE_3_ARGS_EX 8
887ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
888 %endif ; RT_ARCH_AMD64
889 %endif ; locked
890%endmacro
891IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
892IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
893IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
894IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
895
896;;
897; Macro for implementing a bit search operator.
898;
899; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
900; system where the 64-bit accesses requires hand coding.
901;
902; All the functions takes a pointer to the destination memory operand in A0,
903; the source register operand in A1 and a pointer to eflags in A2.
904;
905; In the ZF case the destination register is 'undefined', however it seems that
906; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
907; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
908; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
909; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
910;
911; @param 1 The instruction mnemonic.
912; @param 2 The modified flags.
913; @param 3 The undefined flags.
914; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
915;
; Three functions are generated per operand size:
;  - plain:  load guest flags, run the instruction, save the (%2 | %3) flags.
;  - _intel: flags are computed with fixed masks plus a parity table lookup on
;            the result; the ZF=1 path sets ZF and PF instead.
;  - _amd:   only the %2 flags are taken from the instruction.
; When %4 is non-zero the destination store is skipped if the instruction
; sets ZF. Each '.unchanged_dst' is local to its own function label.
916%macro IEMIMPL_BIT_OP2 4
917BEGINCODE
918BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
919 PROLOGUE_3_ARGS
920 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
921 %1 T0_16, A1_16
922%if %4 != 0
923 jz .unchanged_dst
924%endif
925 mov [A0], T0_16
926.unchanged_dst:
927 IEM_SAVE_FLAGS A2, %2, %3
928 EPILOGUE_3_ARGS
929ENDPROC iemAImpl_ %+ %1 %+ _u16
930
931BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
932 PROLOGUE_3_ARGS
933 %1 T1_16, A1_16 ; result goes to T1, which the parity lookup below consumes.
934%if %4 != 0
935 jz .unchanged_dst
936%endif
937 mov [A0], T1_16
938 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
939 EPILOGUE_3_ARGS
940.unchanged_dst:
941 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
942 EPILOGUE_3_ARGS
943ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
944
945BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
946 PROLOGUE_3_ARGS
947 %1 T0_16, A1_16
948%if %4 != 0
949 jz .unchanged_dst
950%endif
951 mov [A0], T0_16
952.unchanged_dst:
953 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
954 EPILOGUE_3_ARGS
955ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
956
957
958BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
959 PROLOGUE_3_ARGS
960 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
961 %1 T0_32, A1_32
962%if %4 != 0
963 jz .unchanged_dst
964%endif
965 mov [A0], T0_32
966.unchanged_dst:
967 IEM_SAVE_FLAGS A2, %2, %3
968 EPILOGUE_3_ARGS
969ENDPROC iemAImpl_ %+ %1 %+ _u32
970
971BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
972 PROLOGUE_3_ARGS
973 %1 T1_32, A1_32
974%if %4 != 0
975 jz .unchanged_dst
976%endif
977 mov [A0], T1_32
978 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
979 EPILOGUE_3_ARGS
980.unchanged_dst:
981 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
982 EPILOGUE_3_ARGS
983ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
984
985BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
986 PROLOGUE_3_ARGS
987 %1 T0_32, A1_32
988%if %4 != 0
989 jz .unchanged_dst
990%endif
991 mov [A0], T0_32
992.unchanged_dst:
993 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
994 EPILOGUE_3_ARGS
995ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
996
997
998 %ifdef RT_ARCH_AMD64
999
1000BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1001 PROLOGUE_3_ARGS
1002 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1003 %1 T0, A1
1004%if %4 != 0
1005 jz .unchanged_dst
1006%endif
1007 mov [A0], T0
1008.unchanged_dst:
1009 IEM_SAVE_FLAGS A2, %2, %3
1010 EPILOGUE_3_ARGS_EX 8
1011ENDPROC iemAImpl_ %+ %1 %+ _u64
1012
1013BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1014 PROLOGUE_3_ARGS
; NOTE(review): the _u16_intel and _u32_intel variants have no flag load here - confirm whether this one is needed.
1015 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1016 %1 T1, A1
1017%if %4 != 0
1018 jz .unchanged_dst
1019%endif
1020 mov [A0], T1
1021 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1022 EPILOGUE_3_ARGS
1023.unchanged_dst:
1024 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1025 EPILOGUE_3_ARGS
1026ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1027
1028BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1029 PROLOGUE_3_ARGS
1030 %1 T0, A1
1031%if %4 != 0
1032 jz .unchanged_dst
1033%endif
1034 mov [A0], T0
1035.unchanged_dst:
1036 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1037 EPILOGUE_3_ARGS_EX 8
1038ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1039
1040 %endif ; RT_ARCH_AMD64
1041%endmacro
1042
1043IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1044IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1045IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1046IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1047
1048
1049;;
1050; Macro for implementing POPCNT.
1051;
1052; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1053; system where the 64-bit accesses requires hand coding.
1054;
1055; All the functions takes a pointer to the destination memory operand in A0,
1056; the source register operand in A1 and a pointer to eflags in A2.
1057;
1058; ASSUMES Intel and AMD set EFLAGS the same way.
1059;
1060; ASSUMES the instruction does not support memory destination.
1061;
1062; @param 1 The instruction mnemonic.
1063; @param 2 The modified flags.
1064; @param 3 The undefined flags.
1065;
; Each function loads the guest flags, executes the instruction from A1 into
; T0, always stores T0 to [A0], and saves the resulting flags via A2.
1066%macro IEMIMPL_BIT_OP3 3
1067BEGINCODE
1068BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1069 PROLOGUE_3_ARGS
1070 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1071 %1 T0_16, A1_16
1072 mov [A0], T0_16 ; mov does not alter EFLAGS, so they are still intact for the save below.
1073 IEM_SAVE_FLAGS A2, %2, %3
1074 EPILOGUE_3_ARGS
1075ENDPROC iemAImpl_ %+ %1 %+ _u16
1076
1077BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1078 PROLOGUE_3_ARGS
1079 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1080 %1 T0_32, A1_32
1081 mov [A0], T0_32
1082 IEM_SAVE_FLAGS A2, %2, %3
1083 EPILOGUE_3_ARGS
1084ENDPROC iemAImpl_ %+ %1 %+ _u32
1085
; The 64-bit variant is only generated for AMD64.
1086 %ifdef RT_ARCH_AMD64
1087BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1088 PROLOGUE_3_ARGS
1089 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1090 %1 T0, A1
1091 mov [A0], T0
1092 IEM_SAVE_FLAGS A2, %2, %3
1093 EPILOGUE_3_ARGS_EX 8
1094ENDPROC iemAImpl_ %+ %1 %+ _u64
1095 %endif ; RT_ARCH_AMD64
1096%endmacro
1097IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1098
1099
1100;
1101; IMUL is also a similar but yet different case (no lock, no mem dst).
1102; The rDX:rAX variant of imul is handled together with mul further down.
1103;
1104BEGINCODE
1105; @param 1 EFLAGS that are modified.
1106; @param 2 Undefined EFLAGS.
1107; @param 3 Function suffix.
1108; @param 4 EFLAGS variation: 0 for native, 1 for intel, 2 for AMD. Only variation 1 is special-cased
1109; below (IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF); variations 0 and 2 both use plain IEM_SAVE_FLAGS.
1110%macro IEMIMPL_IMUL_TWO 4
1111BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1112 PROLOGUE_3_ARGS
1113 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1114 imul A1_16, word [A0] ; A1 = A1 * [A0]
1115 mov [A0], A1_16 ; write the product back through A0
1116 %if %4 != 1
1117 IEM_SAVE_FLAGS A2, %1, %2
1118 %else
1119 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1120 %endif
1121 EPILOGUE_3_ARGS
1122ENDPROC iemAImpl_imul_two_u16 %+ %3
1123
1124BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1125 PROLOGUE_3_ARGS
1126 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1127 imul A1_32, dword [A0] ; A1 = A1 * [A0]
1128 mov [A0], A1_32 ; write the product back through A0
1129 %if %4 != 1
1130 IEM_SAVE_FLAGS A2, %1, %2
1131 %else
1132 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1133 %endif
1134 EPILOGUE_3_ARGS
1135ENDPROC iemAImpl_imul_two_u32 %+ %3
1136
1137 %ifdef RT_ARCH_AMD64
1138BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1139 PROLOGUE_3_ARGS
1140 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1141 imul A1, qword [A0] ; A1 = A1 * [A0]
1142 mov [A0], A1 ; write the product back through A0
1143 %if %4 != 1
1144 IEM_SAVE_FLAGS A2, %1, %2
1145 %else
1146 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1147 %endif
1148 EPILOGUE_3_ARGS_EX 8
1149ENDPROC iemAImpl_imul_two_u64 %+ %3
1150 %endif ; RT_ARCH_AMD64
1151%endmacro
1152IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1153IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1154IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1155
1156
1157;
1158; XCHG for memory operands. This implies locking. No flag changes.
1159;
1160; Each function takes two arguments, first the pointer to the memory (A0),
1161; then the pointer to the register value (A1). They all return void.
1162;
1163BEGINCODE
1164BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1165 PROLOGUE_2_ARGS
1166 mov T0_8, [A1] ; T0 = register value
1167 xchg [A0], T0_8 ; swap with memory; T0 = old memory value
1168 mov [A1], T0_8 ; hand the old memory value back via A1
1169 EPILOGUE_2_ARGS
1170ENDPROC iemAImpl_xchg_u8_locked
1171
1172BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1173 PROLOGUE_2_ARGS
1174 mov T0_16, [A1] ; T0 = register value
1175 xchg [A0], T0_16 ; swap with memory; T0 = old memory value
1176 mov [A1], T0_16 ; hand the old memory value back via A1
1177 EPILOGUE_2_ARGS
1178ENDPROC iemAImpl_xchg_u16_locked
1179
1180BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1181 PROLOGUE_2_ARGS
1182 mov T0_32, [A1] ; T0 = register value
1183 xchg [A0], T0_32 ; swap with memory; T0 = old memory value
1184 mov [A1], T0_32 ; hand the old memory value back via A1
1185 EPILOGUE_2_ARGS
1186ENDPROC iemAImpl_xchg_u32_locked
1187
1188%ifdef RT_ARCH_AMD64
1189BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1190 PROLOGUE_2_ARGS
1191 mov T0, [A1] ; T0 = register value
1192 xchg [A0], T0 ; swap with memory; T0 = old memory value
1193 mov [A1], T0 ; hand the old memory value back via A1
1194 EPILOGUE_2_ARGS
1195ENDPROC iemAImpl_xchg_u64_locked
1196%endif
1197
1198; Unlocked variants for fDisregardLock mode. These use plain loads and stores instead of xchg; same arguments as the locked variants.
1199
1200BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
1201 PROLOGUE_2_ARGS
1202 mov T0_8, [A1] ; T0 = register value
1203 mov T1_8, [A0] ; T1 = memory value
1204 mov [A0], T0_8 ; memory = register value
1205 mov [A1], T1_8 ; register = old memory value
1206 EPILOGUE_2_ARGS
1207ENDPROC iemAImpl_xchg_u8_unlocked
1208
1209BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
1210 PROLOGUE_2_ARGS
1211 mov T0_16, [A1] ; T0 = register value
1212 mov T1_16, [A0] ; T1 = memory value
1213 mov [A0], T0_16 ; memory = register value
1214 mov [A1], T1_16 ; register = old memory value
1215 EPILOGUE_2_ARGS
1216ENDPROC iemAImpl_xchg_u16_unlocked
1217
1218BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
1219 PROLOGUE_2_ARGS
1220 mov T0_32, [A1] ; T0 = register value
1221 mov T1_32, [A0] ; T1 = memory value
1222 mov [A0], T0_32 ; memory = register value
1223 mov [A1], T1_32 ; register = old memory value
1224 EPILOGUE_2_ARGS
1225ENDPROC iemAImpl_xchg_u32_unlocked
1226
1227%ifdef RT_ARCH_AMD64
1228BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
1229 PROLOGUE_2_ARGS
1230 mov T0, [A1] ; T0 = register value
1231 mov T1, [A0] ; T1 = memory value
1232 mov [A0], T0 ; memory = register value
1233 mov [A1], T1 ; register = old memory value
1234 EPILOGUE_2_ARGS
1235ENDPROC iemAImpl_xchg_u64_unlocked
1236%endif
1237
1238
1239;
1240; XADD for memory operands.
1241;
1242; Each function takes three arguments, first the pointer to the
1243; memory/register (A0), then the pointer to the register (A1), and finally a
1244; pointer to eflags (A2). They all return void.
1245;
1246BEGINCODE
1247BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1248 PROLOGUE_3_ARGS
1249 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1250 mov T0_8, [A1] ; T0 = register operand
1251 xadd [A0], T0_8 ; [A0] += T0; T0 = old [A0]
1252 mov [A1], T0_8 ; return the old destination value via A1
1253 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1254 EPILOGUE_3_ARGS
1255ENDPROC iemAImpl_xadd_u8
1256
1257BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1258 PROLOGUE_3_ARGS
1259 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1260 mov T0_16, [A1] ; T0 = register operand
1261 xadd [A0], T0_16 ; [A0] += T0; T0 = old [A0]
1262 mov [A1], T0_16 ; return the old destination value via A1
1263 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1264 EPILOGUE_3_ARGS
1265ENDPROC iemAImpl_xadd_u16
1266
1267BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1268 PROLOGUE_3_ARGS
1269 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1270 mov T0_32, [A1] ; T0 = register operand
1271 xadd [A0], T0_32 ; [A0] += T0; T0 = old [A0]
1272 mov [A1], T0_32 ; return the old destination value via A1
1273 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1274 EPILOGUE_3_ARGS
1275ENDPROC iemAImpl_xadd_u32
1276
1277%ifdef RT_ARCH_AMD64
1278BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1279 PROLOGUE_3_ARGS
1280 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1281 mov T0, [A1] ; T0 = register operand
1282 xadd [A0], T0 ; [A0] += T0; T0 = old [A0]
1283 mov [A1], T0 ; return the old destination value via A1
1284 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1285 EPILOGUE_3_ARGS
1286ENDPROC iemAImpl_xadd_u64
1287%endif ; RT_ARCH_AMD64
1288
1289BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12 ; lock-prefixed XADD variants; A0 = destination ptr, A1 = register ptr, A2 = eflags ptr
1290 PROLOGUE_3_ARGS
1291 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1292 mov T0_8, [A1] ; T0 = register operand
1293 lock xadd [A0], T0_8 ; [A0] += T0; T0 = old [A0]
1294 mov [A1], T0_8 ; return the old destination value via A1
1295 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1296 EPILOGUE_3_ARGS
1297ENDPROC iemAImpl_xadd_u8_locked
1298
1299BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1300 PROLOGUE_3_ARGS
1301 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1302 mov T0_16, [A1] ; T0 = register operand
1303 lock xadd [A0], T0_16 ; [A0] += T0; T0 = old [A0]
1304 mov [A1], T0_16 ; return the old destination value via A1
1305 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1306 EPILOGUE_3_ARGS
1307ENDPROC iemAImpl_xadd_u16_locked
1308
1309BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1310 PROLOGUE_3_ARGS
1311 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1312 mov T0_32, [A1] ; T0 = register operand
1313 lock xadd [A0], T0_32 ; [A0] += T0; T0 = old [A0]
1314 mov [A1], T0_32 ; return the old destination value via A1
1315 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1316 EPILOGUE_3_ARGS
1317ENDPROC iemAImpl_xadd_u32_locked
1318
1319%ifdef RT_ARCH_AMD64
1320BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1321 PROLOGUE_3_ARGS
1322 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1323 mov T0, [A1] ; T0 = register operand
1324 lock xadd [A0], T0 ; [A0] += T0; T0 = old [A0]
1325 mov [A1], T0 ; return the old destination value via A1
1326 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1327 EPILOGUE_3_ARGS
1328ENDPROC iemAImpl_xadd_u64_locked
1329%endif ; RT_ARCH_AMD64
1330
1331
1332;
1333; CMPXCHG8B.
1334;
1335; These are tricky register wise, so the code is duplicated for each calling
1336; convention (64-bit MSC, 64-bit GCC/SysV and 32-bit fastcall).
1337;
1338; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1339;
1340; C-proto:
1341; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1342; uint32_t *pEFlags));
1343;
1344; Note! Same structure as iemAImpl_cmpxchg16b, just with 32-bit register halves.
1345;
1346BEGINCODE
1347BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1348%ifdef RT_ARCH_AMD64
1349 %ifdef ASM_CALL64_MSC
1350 push rbx ; rbx is loaded with the exchange value below; restored before returning
1351
1352 mov r11, rdx ; pu64EaxEdx (is also T1)
1353 mov r10, rcx ; pu64Dst
1354
1355 mov ebx, [r8] ; ecx:ebx = exchange value from pu64EbxEcx
1356 mov ecx, [r8 + 4]
1357 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1358 mov eax, [r11] ; edx:eax = comparand from pu64EaxEdx (loaded after the flags macro clobbered eax)
1359 mov edx, [r11 + 4]
1360
1361 lock cmpxchg8b [r10]
1362
1363 mov [r11], eax ; write edx:eax back to pu64EaxEdx
1364 mov [r11 + 4], edx
1365 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1366
1367 pop rbx
1368 ret
1369 %else
1370 push rbx ; rbx is loaded with the exchange value below; restored before returning
1371
1372 mov r10, rcx ; pEFlags
1373 mov r11, rdx ; pu64EbxEcx (is also T1)
1374
1375 mov ebx, [r11] ; ecx:ebx = exchange value from pu64EbxEcx
1376 mov ecx, [r11 + 4]
1377 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1378 mov eax, [rsi] ; edx:eax = comparand from pu64EaxEdx (rsi)
1379 mov edx, [rsi + 4]
1380
1381 lock cmpxchg8b [rdi] ; rdi = pu64Dst
1382
1383 mov [rsi], eax ; write edx:eax back to pu64EaxEdx
1384 mov [rsi + 4], edx
1385 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1386
1387 pop rbx
1388 ret
1389
1390 %endif
1391%else
1392 push esi
1393 push edi
1394 push ebx
1395 push ebp
1396
1397 mov edi, ecx ; pu64Dst
1398 mov esi, edx ; pu64EaxEdx
1399 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 bytes of pushes + 4 bytes return address)
1400 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1401
1402 mov ebx, [ecx] ; ecx:ebx = exchange value (ecx is the pointer, so it is loaded last)
1403 mov ecx, [ecx + 4]
1404 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1405 mov eax, [esi] ; edx:eax = comparand from pu64EaxEdx
1406 mov edx, [esi + 4]
1407
1408 lock cmpxchg8b [edi]
1409
1410 mov [esi], eax ; write edx:eax back to pu64EaxEdx
1411 mov [esi + 4], edx
1412 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1413
1414 pop ebp
1415 pop ebx
1416 pop edi
1417 pop esi
1418 ret 8 ; also releases the two stack arguments (pu64EbxEcx, pEFlags)
1419%endif
1420ENDPROC iemAImpl_cmpxchg8b
1421
1422BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1423 ; Lazy bird always lock prefixes cmpxchg8b, so the locked variant just tail-jumps to the common worker.
1424 jmp NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
1425ENDPROC iemAImpl_cmpxchg8b_locked
1426
1427%ifdef RT_ARCH_AMD64
1428
1429;
1430; CMPXCHG16B.
1431;
1432; These are tricky register wise, so the code is duplicated for each calling
1433; convention (64-bit MSC and 64-bit GCC/SysV).
1434;
1435; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1436;
1437; C-proto:
1438; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1439; uint32_t *pEFlags));
1440;
1441; Note! Same structure as iemAImpl_cmpxchg8b, just with 64-bit register halves.
1442;
1443BEGINCODE
1444BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
1445 %ifdef ASM_CALL64_MSC
1446 push rbx ; rbx is loaded with the exchange value below; restored before returning
1447
1448 mov r11, rdx ; pu128RaxRdx (is also T1)
1449 mov r10, rcx ; pu128Dst
1450
1451 mov rbx, [r8] ; rcx:rbx = exchange value from pu128RbxRcx
1452 mov rcx, [r8 + 8]
1453 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1454 mov rax, [r11] ; rdx:rax = comparand from pu128RaxRdx (loaded after the flags macro clobbered eax)
1455 mov rdx, [r11 + 8]
1456
1457 lock cmpxchg16b [r10]
1458
1459 mov [r11], rax ; write rdx:rax back to pu128RaxRdx
1460 mov [r11 + 8], rdx
1461 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1462
1463 pop rbx
1464 ret
1465 %else
1466 push rbx ; rbx is loaded with the exchange value below; restored before returning
1467
1468 mov r10, rcx ; pEFlags
1469 mov r11, rdx ; pu128RbxRcx (is also T1)
1470
1471 mov rbx, [r11] ; rcx:rbx = exchange value from pu128RbxRcx
1472 mov rcx, [r11 + 8]
1473 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1474 mov rax, [rsi] ; rdx:rax = comparand from pu128RaxRdx (rsi)
1475 mov rdx, [rsi + 8]
1476
1477 lock cmpxchg16b [rdi] ; rdi = pu128Dst
1478
1479 mov [rsi], rax ; write rdx:rax back to pu128RaxRdx
1480 mov [rsi + 8], rdx
1481 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1482
1483 pop rbx
1484 ret
1485
1486 %endif
1487ENDPROC iemAImpl_cmpxchg16b
1488
1489BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
1490 ; Lazy bird always lock prefixes cmpxchg16b, so the locked variant just tail-jumps to the common worker.
1491 jmp NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
1492ENDPROC iemAImpl_cmpxchg16b_locked
1493
1494%endif ; RT_ARCH_AMD64
1495
1496
1497;
1498; CMPXCHG.
1499;
1500; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1501;
1502; C-proto:
1503; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
1504; @param 1 Optional instruction prefix (empty or lock). @param 2 Function name suffix (empty or _locked).
1505BEGINCODE
1506%macro IEMIMPL_CMPXCHG 2
1507BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1508 PROLOGUE_4_ARGS
1509 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1510 mov al, [A1] ; accumulator = comparand (loaded after the flags macro clobbered eax)
1511 %1 cmpxchg [A0], A2_8
1512 mov [A1], al ; write the accumulator back via puEax
1513 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1514 EPILOGUE_4_ARGS
1515ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1516
1517BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1518 PROLOGUE_4_ARGS
1519 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1520 mov ax, [A1] ; accumulator = comparand
1521 %1 cmpxchg [A0], A2_16
1522 mov [A1], ax ; write the accumulator back via puEax
1523 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1524 EPILOGUE_4_ARGS
1525ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1526
1527BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1528 PROLOGUE_4_ARGS
1529 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1530 mov eax, [A1] ; accumulator = comparand
1531 %1 cmpxchg [A0], A2_32
1532 mov [A1], eax ; write the accumulator back via puEax
1533 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1534 EPILOGUE_4_ARGS
1535ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1536
1537BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1538%ifdef RT_ARCH_AMD64
1539 PROLOGUE_4_ARGS
1540 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1541 mov rax, [A1] ; accumulator = comparand
1542 %1 cmpxchg [A0], A2
1543 mov [A1], rax ; write the accumulator back via puEax
1544 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1545 EPILOGUE_4_ARGS
1546%else
1547 ;
1548 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b. Note that this path is always lock prefixed.
1549 ;
1550 push esi
1551 push edi
1552 push ebx
1553 push ebp
1554
1555 mov edi, ecx ; pu64Dst
1556 mov esi, edx ; pu64Rax
1557 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1558 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1559
1560 mov ebx, [ecx]
1561 mov ecx, [ecx + 4]
1562 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1563 mov eax, [esi]
1564 mov edx, [esi + 4]
1565
1566 lock cmpxchg8b [edi]
1567
1568 ; cmpxchg8b only sets ZF (1 = equal/exchanged), so CF, PF, AF, SF and OF have to be produced by hand.
1569 jnz .cmpxchg8b_not_equal ; ZF clear: compare failed and edx:eax now holds the memory value.
1570 cmp eax, eax ; equal: keeps ZF set and sets the other flags as for a zero difference.
1571.store:
1572 mov [esi], eax
1573 mov [esi + 4], edx
1574 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1575
1576 pop ebp
1577 pop ebx
1578 pop edi
1579 pop esi
1580 ret 8
1581
1582.cmpxchg8b_not_equal: ; [esi] still holds the original comparand here, edx:eax the memory value.
1583 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1584 jne .store ; high halves differ: use the flags of the high compare.
1585 cmp [esi], eax ; high halves equal: the low halves decide.
1586 jmp .store
1587
1588%endif
1589ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1590%endmacro ; IEMIMPL_CMPXCHG
1591
1592IEMIMPL_CMPXCHG , ,
1593IEMIMPL_CMPXCHG lock, _locked
1594
1595;;
1596; Macro for implementing a unary operator.
1597;
1598; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1599; variants, except on 32-bit systems where the 64-bit accesses require hand
1600; coding.
1601;
1602; All the functions take a pointer to the destination memory operand in A0
1603; and a pointer to eflags in A1 (there is no separate source operand).
1604;
1605; @param 1 The instruction mnemonic.
1606; @param 2 The modified flags.
1607; @param 3 The undefined flags.
1608;
1609%macro IEMIMPL_UNARY_OP 3
1610BEGINCODE
1611BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1612 PROLOGUE_2_ARGS
1613 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1614 %1 byte [A0] ; operate directly on the memory operand
1615 IEM_SAVE_FLAGS A1, %2, %3
1616 EPILOGUE_2_ARGS
1617ENDPROC iemAImpl_ %+ %1 %+ _u8
1618
1619BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1620 PROLOGUE_2_ARGS
1621 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1622 lock %1 byte [A0] ; same as above, lock prefixed
1623 IEM_SAVE_FLAGS A1, %2, %3
1624 EPILOGUE_2_ARGS
1625ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1626
1627BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1628 PROLOGUE_2_ARGS
1629 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1630 %1 word [A0] ; operate directly on the memory operand
1631 IEM_SAVE_FLAGS A1, %2, %3
1632 EPILOGUE_2_ARGS
1633ENDPROC iemAImpl_ %+ %1 %+ _u16
1634
1635BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1636 PROLOGUE_2_ARGS
1637 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1638 lock %1 word [A0] ; same as above, lock prefixed
1639 IEM_SAVE_FLAGS A1, %2, %3
1640 EPILOGUE_2_ARGS
1641ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1642
1643BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1644 PROLOGUE_2_ARGS
1645 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1646 %1 dword [A0] ; operate directly on the memory operand
1647 IEM_SAVE_FLAGS A1, %2, %3
1648 EPILOGUE_2_ARGS
1649ENDPROC iemAImpl_ %+ %1 %+ _u32
1650
1651BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1652 PROLOGUE_2_ARGS
1653 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1654 lock %1 dword [A0] ; same as above, lock prefixed
1655 IEM_SAVE_FLAGS A1, %2, %3
1656 EPILOGUE_2_ARGS
1657ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1658
1659 %ifdef RT_ARCH_AMD64
1660BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1661 PROLOGUE_2_ARGS
1662 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1663 %1 qword [A0] ; operate directly on the memory operand
1664 IEM_SAVE_FLAGS A1, %2, %3
1665 EPILOGUE_2_ARGS
1666ENDPROC iemAImpl_ %+ %1 %+ _u64
1667
1668BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1669 PROLOGUE_2_ARGS
1670 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1671 lock %1 qword [A0] ; same as above, lock prefixed
1672 IEM_SAVE_FLAGS A1, %2, %3
1673 EPILOGUE_2_ARGS
1674ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1675 %endif ; RT_ARCH_AMD64
1676
1677%endmacro
1678
1679IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1680IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1681IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1682IEMIMPL_UNARY_OP not, 0, 0
1683
1684
1685;
1686; BSWAP. No flag changes.
1687;
1688; Each function takes one argument, a pointer to the value to bswap
1689; (input/output) in A0. They all return void.
1690;
1691BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1692 PROLOGUE_1_ARGS
1693 mov T0_32, [A0] ; just in case any of the upper bits are used.
1694 db 66h ; hand-emitted operand-size prefix for the bswap on the next line
1695 bswap T0_32
1696 mov [A0], T0_32 ; the full 32 bits are written back
1697 EPILOGUE_1_ARGS
1698ENDPROC iemAImpl_bswap_u16
1699
1700BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1701 PROLOGUE_1_ARGS
1702 mov T0_32, [A0]
1703 bswap T0_32
1704 mov [A0], T0_32
1705 EPILOGUE_1_ARGS
1706ENDPROC iemAImpl_bswap_u32
1707
1708BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1709%ifdef RT_ARCH_AMD64
1710 PROLOGUE_1_ARGS
1711 mov T0, [A0]
1712 bswap T0
1713 mov [A0], T0
1714 EPILOGUE_1_ARGS
1715%else
1716 PROLOGUE_1_ARGS
1717 mov T0, [A0] ; T0 = low dword
1718 mov T1, [A0 + 4] ; T1 = high dword
1719 bswap T0
1720 bswap T1
1721 mov [A0 + 4], T0 ; the swapped low dword becomes the high dword
1722 mov [A0], T1 ; and the swapped high dword becomes the low dword
1723 EPILOGUE_1_ARGS
1724%endif
1725ENDPROC iemAImpl_bswap_u64
1726
1727
1728;;
1729; Macro for implementing a shift operation.
1730;
1731; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1732; 32-bit systems where the 64-bit accesses require hand coding.
1733;
1734; All the functions take a pointer to the destination memory operand in A0,
1735; the shift count in A1 and a pointer to eflags in A2.
1736;
1737; @param 1 The instruction mnemonic.
1738; @param 2 The modified flags.
1739; @param 3 The undefined flags.
1740;
1741; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1742;
1743; @note the _intel and _amd variants are implemented in C.
1744;
1745%macro IEMIMPL_SHIFT_OP 3
1746BEGINCODE
1747BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1748 PROLOGUE_3_ARGS
1749 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1750 %ifdef ASM_CALL64_GCC
1751 mov cl, A1_8 ; the shift count must be in cl
1752 %1 byte [A0], cl
1753 %else
1754 xchg A1, A0 ; swap so A1 is the destination pointer and the count sits in cl (relies on the register assignments)
1755 %1 byte [A1], cl
1756 %endif
1757 IEM_SAVE_FLAGS A2, %2, %3
1758 EPILOGUE_3_ARGS
1759ENDPROC iemAImpl_ %+ %1 %+ _u8
1760
1761BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1762 PROLOGUE_3_ARGS
1763 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1764 %ifdef ASM_CALL64_GCC
1765 mov cl, A1_8 ; the shift count must be in cl
1766 %1 word [A0], cl
1767 %else
1768 xchg A1, A0 ; swap so A1 is the destination pointer and the count sits in cl
1769 %1 word [A1], cl
1770 %endif
1771 IEM_SAVE_FLAGS A2, %2, %3
1772 EPILOGUE_3_ARGS
1773ENDPROC iemAImpl_ %+ %1 %+ _u16
1774
1775BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1776 PROLOGUE_3_ARGS
1777 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1778 %ifdef ASM_CALL64_GCC
1779 mov cl, A1_8 ; the shift count must be in cl
1780 %1 dword [A0], cl
1781 %else
1782 xchg A1, A0 ; swap so A1 is the destination pointer and the count sits in cl
1783 %1 dword [A1], cl
1784 %endif
1785 IEM_SAVE_FLAGS A2, %2, %3
1786 EPILOGUE_3_ARGS
1787ENDPROC iemAImpl_ %+ %1 %+ _u32
1788
1789 %ifdef RT_ARCH_AMD64
1790BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1791 PROLOGUE_3_ARGS
1792 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1793 %ifdef ASM_CALL64_GCC
1794 mov cl, A1_8 ; the shift count must be in cl
1795 %1 qword [A0], cl
1796 %else
1797 xchg A1, A0 ; swap so A1 is the destination pointer and the count sits in cl
1798 %1 qword [A1], cl
1799 %endif
1800 IEM_SAVE_FLAGS A2, %2, %3
1801 EPILOGUE_3_ARGS
1802ENDPROC iemAImpl_ %+ %1 %+ _u64
1803 %endif ; RT_ARCH_AMD64
1804
1805%endmacro
1806
1807IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1808IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1809IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
1810IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
1811IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1812IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1813IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1814
1815
1816;;
1817; Macro for implementing a double precision shift operation.
1818;
1819; This will generate code for the 16, 32 and 64 bit accesses, except on
1820; 32-bit systems where the 64-bit accesses require hand coding.
1821;
1822; The functions take a pointer to the destination operand (r/m) in A0, the source (reg) in
1823; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1824;
1825; @param 1 The instruction mnemonic.
1826; @param 2 The modified flags.
1827; @param 3 The undefined flags.
1828;
1829; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1830;
1831; @note the _intel and _amd variants are implemented in C.
1832;
1833%macro IEMIMPL_SHIFT_DBL_OP 3
1834BEGINCODE
1835BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1836 PROLOGUE_4_ARGS
1837 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1838 %ifdef ASM_CALL64_GCC
1839 xchg A3, A2 ; temporarily swap so the count sits in cl (relies on the register assignments)
1840 %1 [A0], A1_16, cl
1841 xchg A3, A2 ; swap back so A3 is the eflags pointer again
1842 %else
1843 xchg A0, A2 ; swap so A2 is the destination pointer and the count sits in cl
1844 %1 [A2], A1_16, cl
1845 %endif
1846 IEM_SAVE_FLAGS A3, %2, %3
1847 EPILOGUE_4_ARGS
1848ENDPROC iemAImpl_ %+ %1 %+ _u16
1849
1850BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1851 PROLOGUE_4_ARGS
1852 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1853 %ifdef ASM_CALL64_GCC
1854 xchg A3, A2 ; temporarily swap so the count sits in cl
1855 %1 [A0], A1_32, cl
1856 xchg A3, A2 ; swap back so A3 is the eflags pointer again
1857 %else
1858 xchg A0, A2 ; swap so A2 is the destination pointer and the count sits in cl
1859 %1 [A2], A1_32, cl
1860 %endif
1861 IEM_SAVE_FLAGS A3, %2, %3
1862 EPILOGUE_4_ARGS
1863ENDPROC iemAImpl_ %+ %1 %+ _u32
1864
1865 %ifdef RT_ARCH_AMD64
1866BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1867 PROLOGUE_4_ARGS
1868 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1869 %ifdef ASM_CALL64_GCC
1870 xchg A3, A2 ; temporarily swap so the count sits in cl
1871 %1 [A0], A1, cl
1872 xchg A3, A2 ; swap back so A3 is the eflags pointer again
1873 %else
1874 xchg A0, A2 ; swap so A2 is the destination pointer and the count sits in cl
1875 %1 [A2], A1, cl
1876 %endif
1877 IEM_SAVE_FLAGS A3, %2, %3
1878 EPILOGUE_4_ARGS_EX 12
1879ENDPROC iemAImpl_ %+ %1 %+ _u64
1880 %endif ; RT_ARCH_AMD64
1881
1882%endmacro
1883
1884IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1885IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1886
1887
1888;;
1889; Macro for implementing multiplication operations.
1890;
1891; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1892; 32-bit systems where the 64-bit accesses require hand coding.
1893;
1894; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1895; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1896; pointer to eflags in A3.
1897;
1898; The functions all return 0 so the caller can be used for div/idiv as well as
1899; for the mul/imul implementation.
1900;
1901; @param 1 The instruction mnemonic.
1902; @param 2 The modified flags.
1903; @param 3 The undefined flags.
1904; @param 4 Name suffix.
1905; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD. Only 1 selects IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF below.
1906;
1907; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1908;
1909%macro IEMIMPL_MUL_OP 5
1910BEGINCODE
1911BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
1912 PROLOGUE_3_ARGS
1913 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1914 mov al, [A0] ; al = multiplicand (low byte of the AX value behind A0)
1915 %1 A1_8 ; ax = al * operand
1916 mov [A0], ax ; store the 16-bit product
1917 %if %5 != 1
1918 IEM_SAVE_FLAGS A2, %2, %3
1919 %else
1920 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
1921 %endif
1922 xor eax, eax ; return 0
1923 EPILOGUE_3_ARGS
1924ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
1925
1926BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
1927 PROLOGUE_4_ARGS
1928 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1929 mov ax, [A0]
1930 %ifdef ASM_CALL64_GCC
1931 %1 A2_16
1932 mov [A0], ax
1933 mov [A1], dx
1934 %else
1935 mov T1, A1 ; keep the rDX pointer in T1 for the store after the multiply
1936 %1 A2_16
1937 mov [A0], ax
1938 mov [T1], dx
1939 %endif
1940 %if %5 != 1
1941 IEM_SAVE_FLAGS A3, %2, %3
1942 %else
1943 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
1944 %endif
1945 xor eax, eax ; return 0
1946 EPILOGUE_4_ARGS
1947ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
1948
1949BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
1950 PROLOGUE_4_ARGS
1951 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1952 mov eax, [A0]
1953 %ifdef ASM_CALL64_GCC
1954 %1 A2_32
1955 mov [A0], eax
1956 mov [A1], edx
1957 %else
1958 mov T1, A1 ; keep the rDX pointer in T1 for the store after the multiply
1959 %1 A2_32
1960 mov [A0], eax
1961 mov [T1], edx
1962 %endif
1963 %if %5 != 1
1964 IEM_SAVE_FLAGS A3, %2, %3
1965 %else
1966 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
1967 %endif
1968 xor eax, eax ; return 0
1969 EPILOGUE_4_ARGS
1970ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
1971
1972 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1973BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
1974 PROLOGUE_4_ARGS
1975 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1976 mov rax, [A0]
1977 %ifdef ASM_CALL64_GCC
1978 %1 A2
1979 mov [A0], rax
1980 mov [A1], rdx
1981 %else
1982 mov T1, A1 ; keep the rDX pointer in T1 for the store after the multiply
1983 %1 A2
1984 mov [A0], rax
1985 mov [T1], rdx
1986 %endif
1987 %if %5 != 1
1988 IEM_SAVE_FLAGS A3, %2, %3
1989 %else
1990 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
1991 %endif
1992 xor eax, eax ; return 0
1993 EPILOGUE_4_ARGS_EX 12
1994ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
1995 %endif ; RT_ARCH_AMD64
1996
1997%endmacro
1998
1999IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2000IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2001IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2002IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2003IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2004IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2005
2006
2007BEGINCODE
2008;;
2009; Worker function for negating the 64-bit number held as two 32-bit halves in T1:T0 (T1 = high, T0 = low).
2010; @uses None (T0,T1) - note that the sub/sbb pair modifies EFLAGS.
2011BEGINPROC iemAImpl_negate_T0_T1_u32
2012 push 0
2013 push 0
2014 xchg T0_32, [xSP] ; T0 = 0, stack slot = original low half
2015 xchg T1_32, [xSP + xCB] ; T1 = 0, stack slot = original high half
2016 sub T0_32, [xSP] ; low = 0 - low
2017 sbb T1_32, [xSP + xCB] ; high = 0 - high - borrow
2018 add xSP, xCB*2 ; drop the two scratch slots
2019 ret
2020ENDPROC iemAImpl_negate_T0_T1_u32
2021
2022%ifdef RT_ARCH_AMD64
2023;;
2024; Worker function for negating the 128-bit number held as two 64-bit halves in T1:T0 (T1 = high, T0 = low).
2025; @uses None (T0,T1) - note that the sub/sbb pair modifies EFLAGS.
2026BEGINPROC iemAImpl_negate_T0_T1_u64
2027 push 0
2028 push 0
2029 xchg T0, [xSP] ; T0 = 0, stack slot = original low half
2030 xchg T1, [xSP + xCB] ; T1 = 0, stack slot = original high half
2031 sub T0, [xSP] ; low = 0 - low
2032 sbb T1, [xSP + xCB] ; high = 0 - high - borrow
2033 add xSP, xCB*2 ; drop the two scratch slots
2034 ret
2035ENDPROC iemAImpl_negate_T0_T1_u64
2036%endif
2037
2038
2039;;
2040; Macro for implementing division operations.
2041;
2042; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2043; 32-bit systems where the 64-bit accesses require hand coding.
2044;
2045; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2046; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2047; pointer to eflags in A3.
2048;
2049; The functions all return 0 on success and -1 if a divide error should be
2050; raised by the caller.
2051;
2052; @param 1 The instruction mnemonic.
2053; @param 2 The modified flags.
2054; @param 3 The undefined flags.
2055; @param 4 1 if signed, 0 if unsigned.
2056; @param 5 Function suffix.
2057; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2058; 2 for AMD (set AF, clear PF, ZF and SF).
2059;
2060; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2061;
2062%macro IEMIMPL_DIV_OP 6
2063BEGINCODE
2064BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2065 PROLOGUE_3_ARGS
2066
2067 ; div by chainsaw check.
2068 test A1_8, A1_8
2069 jz .div_zero
2070
2071 ; Overflow check - unsigned division is simple to verify, haven't
2072 ; found a simple way to check signed division yet unfortunately.
2073 %if %4 == 0
2074 cmp [A0 + 1], A1_8
2075 jae .div_overflow
2076 %else
2077 mov T0_16, [A0] ; T0 = dividend
2078 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2079 test A1_8, A1_8
2080 js .divisor_negative
2081 test T0_16, T0_16
2082 jns .both_positive
2083 neg T0_16
2084.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2085 push T0 ; Start off like unsigned below.
2086 shr T0_16, 7
2087 cmp T0_8, A1_8
2088 pop T0
2089 jb .div_no_overflow
2090 ja .div_overflow
2091 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2092 cmp T0_8, A1_8
2093 jae .div_overflow
2094 jmp .div_no_overflow
2095
2096.divisor_negative:
2097 neg A1_8
2098 test T0_16, T0_16
2099 jns .one_of_each
2100 neg T0_16
2101.both_positive: ; Same as unsigned shifted by sign indicator bit.
2102 shr T0_16, 7
2103 cmp T0_8, A1_8
2104 jae .div_overflow
2105.div_no_overflow:
2106 mov A1, T1 ; restore divisor
2107 %endif
2108
2109 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2110 mov ax, [A0]
2111 %1 A1_8
2112 mov [A0], ax
2113 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2114 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2115 %else
2116 IEM_SAVE_FLAGS A2, %2, %3
2117 %endif
2118 xor eax, eax
2119
2120.return:
2121 EPILOGUE_3_ARGS
2122
2123.div_zero:
2124.div_overflow:
2125 mov eax, -1
2126 jmp .return
2127ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2128
;
; 16-bit division worker: DX:AX / divisor -> AX = quotient, DX = remainder.
;
; The native instruction is only executed after the divisor and the quotient
; range have been vetted, so it cannot raise a divide error here.
;
; @returns  eax = 0 on success; -1 on division by zero or quotient overflow,
;           in which case nothing is written back.
; @param    A0      Pointer to the AX value (dividend low word in, quotient out).
; @param    A1      Pointer to the DX value (dividend high word in, remainder out).
; @param    A2      The divisor.
; @param    A3      Handed to the IEM_*_FLAGS macros (presumably the EFLAGS pointer).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; Unsigned: overflows iff high word >= divisor.
        jae     .div_overflow
 %else
        ; Signed: assemble the 32-bit dividend in T0, reduce both operands to
        ; their magnitudes and do unsigned range checks on those.
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
        jo      .div_overflow           ; Dividend was -2^31: the magnitude needs 33 bits after
                                        ; the shift below and the quotient can never fit.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
        jo      .div_overflow           ; Dividend was -2^31, see above.
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Copy the divisor out of A2 before dx is loaded.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; Copy the pointer out of A1 before dx is loaded.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2206
;
; 32-bit division worker: EDX:EAX / divisor -> EAX = quotient, EDX = remainder.
;
; The native instruction is only executed after the divisor and the quotient
; range have been vetted, so it cannot raise a divide error here.
;
; @returns  eax = 0 on success; -1 on division by zero or quotient overflow,
;           in which case nothing is written back.
; @param    A0      Pointer to the EAX value (dividend low dword in, quotient out).
; @param    A1      Pointer to the EDX value (dividend high dword in, remainder out).
; @param    A2      The divisor.
; @param    A3      Handed to the IEM_*_FLAGS macros (presumably the EFLAGS pointer).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; Unsigned: overflows iff high dword >= divisor.
        jae     .div_overflow
 %else
        ; Signed: reduce dividend (T1:T0) and divisor (A2) to magnitudes and
        ; do unsigned range checks.  A2 stays pushed until .div_no_overflow
        ; or .div_overflow pops it again.
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
        test    T1_32, T1_32            ; Still negative => dividend was -2^63.  The shift below
        js      .div_overflow           ; would drop the top magnitude bit; quotient cannot fit.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
        test    T1_32, T1_32            ; Dividend was -2^63, see above.
        js      .div_overflow
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; Restore the original (signed) divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Copy the divisor out of A2 before edx is loaded.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; Copy the pointer out of A1 before edx is loaded.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Balance the push done by the signed range check.
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2292
;
; 64-bit division worker: RDX:RAX / divisor -> RAX = quotient, RDX = remainder.
; Only assembled for AMD64 hosts.
;
; The native instruction is only executed after the divisor and the quotient
; range have been vetted, so it cannot raise a divide error here.
;
; @returns  eax = 0 on success; -1 on division by zero or quotient overflow,
;           in which case nothing is written back.
; @param    A0      Pointer to the RAX value (dividend low qword in, quotient out).
; @param    A1      Pointer to the RDX value (dividend high qword in, remainder out).
; @param    A2      The divisor.
; @param    A3      Handed to the IEM_*_FLAGS macros (presumably the EFLAGS pointer).
;
 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2                  ; Division by zero?
        jz      .div_zero
 %if %4 == 0
        cmp     [A1], A2                ; Unsigned: overflows iff high qword >= divisor.
        jae     .div_overflow
 %else
        ; Signed: reduce dividend (T1:T0) and divisor (A2) to magnitudes and
        ; do unsigned range checks.  A2 stays pushed until .div_no_overflow
        ; or .div_overflow pops it again.
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
        test    T1, T1                  ; Still negative => dividend was -2^127.  The shift below
        js      .div_overflow           ; would drop the top magnitude bit; quotient cannot fit.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
        test    T1, T1                  ; Dividend was -2^127, see above.
        js      .div_overflow
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; Restore the original (signed) divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Copy the divisor out of A2 before rdx is loaded.
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; Copy the pointer out of A1 before rdx is loaded.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Balance the push done by the signed range check.
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64
2377
2378%endmacro
2379
; Instantiate the division workers.  Arguments, in order: the instruction, two
; values forwarded to the IEM_*_FLAGS macros, the signed-check selector
; (0 = unsigned check, otherwise signed), the function name suffix and the
; EFLAGS variant (2 selects the AMD flag adjustment).
IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div,  0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2386
2387
;;
; Generates a worker that executes a single memory fence instruction.
;
; The worker takes no arguments and returns nothing; it just issues the fence
; and returns.
;
; @param    1       The fence instruction (also used as the function name suffix).
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2406
;;
; Memory fence fallback for hosts without SSE2 (no lfence/sfence/mfence).
;
; Exchanges xAX with a copy of itself on the stack: XCHG with a memory operand
; is implicitly locked, which is what provides the fencing.  xAX and the stack
; pointer are unchanged on return.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; Stack slot holding a copy of xAX.
        xchg    xAX, [xSP]              ; Locked exchange with that copy; xAX keeps its value.
        add     xSP, xCB                ; Drop the slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
2416
2417
;;
; Prepares the FPU for the instruction being emulated by loading selected
; parts of the guest's control word and status word.
;
; The current FPU environment is dumped to [xSP], patched and reloaded:
;   - FCW becomes the guest FCW restricted to the exception mask, precision
;     control and rounding control bits.
;   - FSW keeps the current TOP field and takes C0-C3 from the guest; every
;     other status bit (pending exceptions included) is cleared.
;
; @uses     The FNSTENV image at [xSP] (28 bytes in the 32-bit protected-mode
;           layout; the callers in this file reserve 20h bytes), T0, T1.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - exception masks, precision and rounding control from the guest.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - guest C0, C1, C2 and C3 (may be left undefined by the
        ; instruction) merged with the current TOP.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
2443
2444
;;
; Prepares the FPU for the instruction being emulated: loads selected parts of
; the guest's control word and status word, and additionally marks the top
; register as empty in the tag word when the guest's ST0 is empty.
;
; FCW and FSW are patched into the FNSTENV image at [xSP] (guest exception
; masks, PC and RC; current TOP combined with the guest's C0-C3).  The guest's
; TOP is then used as bit index into the guest FTW; a clear bit means the
; guest's ST0 is empty, in which case the tag of register 7 is set to 3 in
; the image before it is reloaded.
;
; ASSUMES actual TOP=7
;
; @uses     The FNSTENV image at [xSP] (28 bytes in the 32-bit protected-mode
;           layout; the callers in this file reserve 20h bytes), T0, T1.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - exception masks, precision and rounding control from the guest.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - guest C0, C1, C2 and C3 (may be left undefined by the
        ; instruction) merged with the current TOP.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK        ; T1 = guest TOP.
        bt      [%1 + X86FXSTATE.FTW], T1_16    ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]
%endmacro
2482
2483
;;
; Output of the FPU workers that produce a single 80-bit value.
; Need to move this as well somewhere better?
;
struc IEMFPURESULT
        .r80Result      resw 5          ; The 80-bit result (10 bytes).
        .FSW            resw 1          ; The FPU status word after the instruction.
endstruc
2491
2492
;;
; Output of the FPU workers that produce two 80-bit values.
; Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
        .r80Result1     resw 5          ; The first 80-bit result (10 bytes).
        .FSW            resw 1          ; The FPU status word after the instruction.
        .r80Result2     resw 5          ; The second 80-bit result (10 bytes).
endstruc
2501
2502
2503;
2504;---------------------- 16-bit signed integer operations ----------------------
2505;
2506
2507
;;
; Converts a 16-bit signed integer value to a 80-bit floating point one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2531
2532
;;
; Stores a 80-bit floating point value (register) as a 16-bit signed integer
; (memory), using FISTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2556
2557
;;
; Stores a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation, using FISTTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2582
2583
;;
; Generates a worker for an FPU instruction combining ST0 (80-bit) with a
; 16-bit signed integer memory operand; the result replaces ST0.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2620
2621
;;
; Generates a worker for an FPU instruction operating on ST0 (80-bit) and a
; 16-bit signed integer memory operand that only produces a status word.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2652
2653
2654
2655;
2656;---------------------- 32-bit signed integer operations ----------------------
2657;
2658
2659
;;
; Converts a 32-bit signed integer value to a 80-bit floating point one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2683
2684
;;
; Stores a 80-bit floating point value (register) as a 32-bit signed integer
; (memory), using FISTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2708
2709
;;
; Stores a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation, using FISTTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2734
2735
;;
; Generates a worker for an FPU instruction combining ST0 (80-bit) with a
; 32-bit signed integer memory operand; the result replaces ST0.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2772
2773
;;
; Generates a worker for an FPU instruction operating on ST0 (80-bit) and a
; 32-bit signed integer memory operand that only produces a status word.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2804
2805
2806
2807;
2808;---------------------- 64-bit signed integer operations ----------------------
2809;
2810
2811
;;
; Converts a 64-bit signed integer value to a 80-bit floating point one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2835
2836
;;
; Stores a 80-bit floating point value (register) as a 64-bit signed integer
; (memory), using FISTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2860
2861
;;
; Stores a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation, using FISTTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2886
2887
2888
2889;
2890;---------------------- 32-bit floating point operations ----------------------
2891;
2892
;;
; Loads a 32-bit floating point value from memory and widens it to the 80-bit
; fpu register format.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2916
2917
;;
; Stores a 80-bit floating point value (register) as a 32-bit floating point
; value (memory), using FST.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2941
2942
;;
; Generates a worker for an FPU instruction combining ST0 (80-bit) with a
; 32-bit floating point memory operand; the result replaces ST0.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2979
2980
;;
; Generates a worker for an FPU instruction operating on ST0 (80-bit) and a
; 32-bit floating point memory operand that only produces a status word.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3011
3012
3013
3014;
3015;---------------------- 64-bit floating point operations ----------------------
3016;
3017
;;
; Loads a 64-bit floating point value from memory and widens it to the 80-bit
; fpu register format.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3041
3042
;;
; Stores a 80-bit floating point value (register) as a 64-bit floating point
; value (memory), using FST.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3066
3067
;;
; Generates a worker for an FPU instruction combining ST0 (80-bit) with a
; 64-bit floating point memory operand; the result replaces ST0.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3104
;;
; Generates a worker for an FPU instruction operating on ST0 (80-bit) and a
; 64-bit floating point memory operand that only produces a status word.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3135
3136
3137
3138;
3139;---------------------- 80-bit floating point operations ----------------------
3140;
3141
;;
; Loads a 80-bit floating point value from memory into an fpu register and
; returns the resulting register value together with the status word.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3165
3166
;;
; Stores a 80-bit floating point register value to memory, using FSTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3190
3191
;;
; Loads a 80-bit packed BCD value from memory (FBLD) and returns it in the
; 80-bit floating point register format.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3215
3216
;;
; Stores a 80-bit floating point register value to memory as packed BCD,
; using FBSTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit BCD value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3240
3241
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values, leaving the result in ST0.
;
; The second value is loaded first so that it ends up in ST1 with the first
; value in ST0.
;
; @param    1       The instruction
; @param    2       The operand list for the instruction; may be empty ({})
;                   for instructions with implicit operands.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; Ends up as ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3282
3283
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values, ST1 and ST0, which stores its result in ST1 and pops the
; stack (so the result is in ST0 when it is written out).
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; Ends up as ST1.
        fld     tword [A3]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3319
3320
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values that only produces a status word (comparisons).
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; Ends up as ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3353
3354
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values that reports its outcome both in FSW and in EFLAGS; the
; EFLAGS value is captured right after the instruction and returned in eax.
;
; @param    1       The instruction
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A3]              ; Ends up as ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        fnstsw  word [A1]
        pushf                           ; Grab EFLAGS before anything below changes them.
        pop     xAX

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3390
3391
;;
; Generates a worker for an FPU instruction working on a single 80-bit
; floating point value in ST0, which is replaced by the result.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; ST0 = operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3428
3429
;;
; Generates a worker for an FPU instruction that examines a single 80-bit
; floating point value in ST0 and only produces a status word.
;
; @param    1       The instruction
; @param    2       Non-zero to also restore FTW (uses the FTW-aware variant
;                   of the environment loader), zero for FCW/FSW only.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch space for the FPU environment image.

        fninit                          ; Start from a clean FPU.
        fld     tword [A2]              ; ST0 = operand.
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word [A1]

        fninit                          ; Leave a clean FPU behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3465
3466
3467
;;
; FPU instruction loading a 80-bit floating point constant.
;
; Generates iemAImpl_<instr>: the instruction pushes its constant onto a
; freshly initialised FPU stack, and FSW plus the pushed value are stored in
; the IEMFPURESULT.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Save FSW before fnclex clears the exception flags, then pop the
        ; constant into the result.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
        ; The name ends with the instruction; there is no suffix to paste on,
        ; so no trailing '%+' here (it must match the BEGINPROC_FASTCALL name).
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3502
3503
;;
; Generates iemAImpl_<instr>_r80_r80 for FPU instructions that consume one
; 80-bit floating point value and produce two.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULTTWO for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; FSW first, then pop both values: the top of stack is stored as
        ; r80Result2 and the entry beneath it as r80Result1.  Exception
        ; flags are cleared before each pop.
        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3538
3539
3540
3541
3542;---------------------- SSE and MMX Operations ----------------------
3543
;
; Hook macros bracketing the bodies of the MMX, SSE and AVX helpers below.
; All six currently expand to nothing.
;

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3561
3562
;;
; Media instruction working on two full sized registers.
;
; Emits iemAImpl_<instr>_u128 and, when requested, iemAImpl_<instr>_u64.
; Both load the two operands, apply the instruction and write the result
; back over the first operand.
;
; @param    1       The instruction
; @param    2       Whether there is an MMX variant (1) or not (0).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first media register size operand (input/output).
; @param    A2      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]               ; source
        movq    mm0, [A1]               ; destination (also an input)
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]              ; source
        movdqu  xmm0, [A1]              ; destination (also an input)
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb,    1
IEMIMPL_MEDIA_F2 pand,      1
IEMIMPL_MEDIA_F2 pandn,     1
IEMIMPL_MEDIA_F2 por,       1
IEMIMPL_MEDIA_F2 pxor,      1
IEMIMPL_MEDIA_F2 pcmpeqb,   1
IEMIMPL_MEDIA_F2 pcmpeqw,   1
IEMIMPL_MEDIA_F2 pcmpeqd,   1
IEMIMPL_MEDIA_F2 pcmpeqq,   0
IEMIMPL_MEDIA_F2 pcmpgtb,   1
IEMIMPL_MEDIA_F2 pcmpgtw,   1
IEMIMPL_MEDIA_F2 pcmpgtd,   1
IEMIMPL_MEDIA_F2 pcmpgtq,   0
IEMIMPL_MEDIA_F2 paddb,     1
IEMIMPL_MEDIA_F2 paddw,     1
IEMIMPL_MEDIA_F2 paddd,     1
IEMIMPL_MEDIA_F2 paddq,     1
IEMIMPL_MEDIA_F2 paddsb,    1
IEMIMPL_MEDIA_F2 paddsw,    1
IEMIMPL_MEDIA_F2 paddusb,   1
IEMIMPL_MEDIA_F2 paddusw,   1
IEMIMPL_MEDIA_F2 psubb,     1
IEMIMPL_MEDIA_F2 psubw,     1
IEMIMPL_MEDIA_F2 psubd,     1
IEMIMPL_MEDIA_F2 psubq,     1
IEMIMPL_MEDIA_F2 psubsb,    1
IEMIMPL_MEDIA_F2 psubsw,    1
IEMIMPL_MEDIA_F2 psubusb,   1
IEMIMPL_MEDIA_F2 psubusw,   1
IEMIMPL_MEDIA_F2 pmullw,    1
IEMIMPL_MEDIA_F2 pmulld,    0
IEMIMPL_MEDIA_F2 pmulhw,    1
IEMIMPL_MEDIA_F2 pmaddwd,   1
IEMIMPL_MEDIA_F2 pminub,    1
IEMIMPL_MEDIA_F2 pminuw,    0
IEMIMPL_MEDIA_F2 pminud,    0
IEMIMPL_MEDIA_F2 pminsb,    0
IEMIMPL_MEDIA_F2 pminsw,    1
IEMIMPL_MEDIA_F2 pminsd,    0
IEMIMPL_MEDIA_F2 pmaxub,    1
IEMIMPL_MEDIA_F2 pmaxuw,    0
IEMIMPL_MEDIA_F2 pmaxud,    0
IEMIMPL_MEDIA_F2 pmaxsb,    0
IEMIMPL_MEDIA_F2 pmaxsw,    1
IEMIMPL_MEDIA_F2 pmaxsd,    0
IEMIMPL_MEDIA_F2 pabsb,     1
IEMIMPL_MEDIA_F2 pabsw,     1
IEMIMPL_MEDIA_F2 pabsd,     1
IEMIMPL_MEDIA_F2 psignb,    1
IEMIMPL_MEDIA_F2 psignw,    1
IEMIMPL_MEDIA_F2 psignd,    1
IEMIMPL_MEDIA_F2 phaddw,    1
IEMIMPL_MEDIA_F2 phaddd,    1
IEMIMPL_MEDIA_F2 phsubw,    1
IEMIMPL_MEDIA_F2 phsubd,    1
IEMIMPL_MEDIA_F2 phaddsw,   1
IEMIMPL_MEDIA_F2 phsubsw,   1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw,  1
IEMIMPL_MEDIA_F2 pmuludq,   1
3663
3664
;;
; Media instruction working on two full sized registers, but no FXSAVE state argument.
;
; Emits iemAImpl_<instr>_u128 and, when requested, iemAImpl_<instr>_u64.
; The result replaces the first operand.
;
; @param    1       The instruction
; @param    2       Whether there is an MMX variant (1) or not (0).
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; source
        movq    mm0, [A0]               ; destination (also an input)
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; source
        movdqu  xmm0, [A0]              ; destination (also an input)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw,    1
IEMIMPL_MEDIA_OPT_F2 pslld,    1
IEMIMPL_MEDIA_OPT_F2 psllq,    1
IEMIMPL_MEDIA_OPT_F2 psrlw,    1
IEMIMPL_MEDIA_OPT_F2 psrld,    1
IEMIMPL_MEDIA_OPT_F2 psrlq,    1
IEMIMPL_MEDIA_OPT_F2 psraw,    1
IEMIMPL_MEDIA_OPT_F2 psrad,    1
IEMIMPL_MEDIA_OPT_F2 pmulhuw,  1
IEMIMPL_MEDIA_OPT_F2 pavgb,    1
IEMIMPL_MEDIA_OPT_F2 pavgw,    1
IEMIMPL_MEDIA_OPT_F2 psadbw,   1
IEMIMPL_MEDIA_OPT_F2 pmuldq,   0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3725
;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; Emits iemAImpl_<instr>_u128 and, when requested, iemAImpl_<instr>_u64.
; Both operands are loaded in full from memory; the result replaces the
; first operand.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; source
        movq    mm0, [A0]               ; destination (also an input)
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; source
        movdqu  xmm0, [A0]              ; destination (also an input)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3769
3770
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; Emits iemAImpl_<instr>_u128 and iemAImpl_<instr>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        ; Close with the epilogue hook (not a second prologue) so the pair
        ; stays balanced should the hooks ever expand to something.
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3815
3816
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; Both operands are loaded in full, so the generated code is the same as for
; the lower-half variant; this macro simply forwards to IEMIMPL_MEDIA_F1L1.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second full sized media register operand, where we
;                   will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high-half wrapper defined above (it expands to the
; same code as calling IEMIMPL_MEDIA_F1L1 directly).
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3835
3836
;;
; Media instruction working two half sized input registers (high half) and a full sized
; destination register (vpunpckh*).
;
; Forwards to IEMIMPL_MEDIA_F1L1L1, which emits the _u128 and _u256 workers.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the upper half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the upper half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3857
3858
;
; Shufflers with evil 8-bit immediates.
;
; The immediate cannot be supplied at runtime, so each worker carries a table
; of 256 fixed-size stubs (instruction + ret), one per immediate value, and
; calls the stub selected by the immediate argument.
;

;;
; PSHUFW, MMX register variant.
;
; @param    A0      Pointer to the destination (output).
; @param    A1      Pointer to the source (input).
; @param    A2      The 8-bit immediate; only the low 8 bits are used.
;
; NOTE(review): the size passed to BEGINPROC_FASTCALL is 16 although three
; arguments are loaded - left as is, confirm against the C prototype.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia! (same as the SSE/AVX shufflers)
        and     A2, 0ffh                ; keep the stub index within the 256 entry table
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64
3888
3889
;;
; SSE shuffle with an 8-bit immediate; emits iemAImpl_<instr>_u128.
;
; Dispatches into a table of 256 six-byte stubs (instruction + ret), one per
; immediate value.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination (output).
; @param    A1      Pointer to the source (input).
; @param    A2      The 8-bit immediate; only the low 8 bits are used.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        and     A2, 0ffh                ; keep the stub index within the 256 entry table
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3921
3922
;;
; AVX shuffle with an 8-bit immediate; emits iemAImpl_<instr>_u256.
;
; Dispatches into a table of 256 six-byte stubs (instruction + ret), one per
; immediate value.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination (output).
; @param    A1      Pointer to the source (input).
; @param    A2      The 8-bit immediate; only the low 8 bits are used.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX code, so use the AVX hooks

        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        and     A2, 0ffh                ; keep the stub index within the 256 entry table
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(vpshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3954
3955
;
; Shifts with evil 8-bit immediates.
;

;;
; MMX shift by an 8-bit immediate; emits iemAImpl_<instr>_imm_u64.
;
; Dispatches into a table of 256 five-byte stubs (instruction + ret), one per
; immediate value.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the operand being shifted (input/output).
; @param    A1      The 8-bit shift count; only the low 8 bits are used.
;
; NOTE(review): the size passed to BEGINPROC_FASTCALL is 16 although only two
; arguments are loaded - left as is, confirm against the C prototype.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        and     A1, 0ffh                ; keep the stub index within the 256 entry table
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
3995
3996
;;
; SSE shift by an 8-bit immediate; emits iemAImpl_<instr>_imm_u128.
;
; Dispatches into a table of 256 six-byte stubs (instruction + ret), one per
; immediate value.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the operand being shifted (input/output).
; @param    A1      The 8-bit shift count; only the low 8 bits are used.
;
; NOTE(review): the size passed to BEGINPROC_FASTCALL is 16 although only two
; arguments are loaded - left as is, confirm against the C prototype.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        and     A1, 0ffh                ; keep the stub index within the 256 entry table
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: (A1 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4034
4035
;
; Move byte mask.
;

;;
; PMOVMSKB, MMX source.
;
; @param    A0      Pointer to the destination; a T0-sized value is stored there
;                   and, on 32-bit x86, the dword at offset 4 is zeroed as well.
; @param    A1      Pointer to the 64-bit source operand.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq     mm1, [A1]
        pmovmskb T0, mm1
        mov      [A0], T0
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4053
;;
; PMOVMSKB, XMM source.
;
; @param    A0      Pointer to the destination; a T0-sized value is stored there
;                   and, on 32-bit x86, the dword at offset 4 is zeroed as well.
; @param    A1      Pointer to the 128-bit source operand.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu   xmm1, [A1]
        pmovmskb T0, xmm1
        mov      [A0], T0
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4067
;;
; VPMOVMSKB, YMM source.
;
; @param    A0      Pointer to the destination; a T0-sized value is stored there
;                   and, on 32-bit x86, the dword at offset 4 is zeroed as well.
; @param    A1      Pointer to the 256-bit source operand.
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu   ymm1, [A1]
        vpmovmskb T0, ymm1
        mov       [A0], T0
%ifdef RT_ARCH_X86
        mov       dword [A0 + 4], 0
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4081
4082
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; Emits iemAImpl_<instr>_u128 and iemAImpl_<instr>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the extended CPU/FPU state (X86XSAVEAREA).
; @param    A1      Pointer to the destination media register size operand (output).
; @param    A2      Pointer to the first source media register size operand (input).
; @param    A3      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        ; Close with the epilogue hook (not a second prologue) so the pair
        ; stays balanced should the hooks ever expand to something.
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4154
4155
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; Emits iemAImpl_<instr>_u128 and iemAImpl_<instr>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        ; Close with the epilogue hook (not a second prologue) so the pair
        ; stays balanced should the hooks ever expand to something.
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4222
;;
; Media instruction working on one full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; Emits iemAImpl_<instr>_u128 and iemAImpl_<instr>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
; NOTE(review): the size passed to BEGINPROC_FASTCALL is 12 although only two
; arguments are loaded - left as is, confirm against the C prototype.
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        ; Close with the epilogue hook (not a second prologue) so the pair
        ; stays balanced should the hooks ever expand to something.
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd
4261
4262
;
; The SSE 4.2 crc32
;
; @param    A0      Pointer to the 32-bit destination.
; @param    A1      The source operand, sized according to the suffix.
;
; crc32 with an 8-bit source: the dword at [A0] is the running value; it is
; read, updated with the low byte of A1 and written back.
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; current value
        crc32   T0_32, A1_8
        mov     [A0], T0_32             ; updated value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4278
; crc32 with a 16-bit source: the dword at [A0] is the running value; it is
; read, updated with the low word of A1 and written back.
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; current value
        crc32   T0_32, A1_16
        mov     [A0], T0_32             ; updated value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4288
; crc32 with a 32-bit source: the dword at [A0] is the running value; it is
; read, updated with the low dword of A1 and written back.
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; current value
        crc32   T0_32, A1_32
        mov     [A0], T0_32             ; updated value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4298
%ifdef RT_ARCH_AMD64
; crc32 with a 64-bit source (AMD64 builds only).  The running value is still
; the dword at [A0]: it is loaded into the 32-bit view of T0, the instruction
; is issued on the full-width registers, and only the low dword is stored back.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; current value
        crc32   T0, A1
        mov     [A0], T0_32             ; updated value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4310
4311
;
; PTEST (SSE 4.1)
;
; Neither operand is written; the only output goes through IEM_SAVE_FLAGS.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        ptest   xmm0, xmm1
        ; Must directly follow ptest so the flags it produced are the ones saved.
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4331
;;
; VPTEST, 256-bit variant.
;
; Neither operand is written; the only output goes through IEM_SAVE_FLAGS.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX code, so use the AVX hooks

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        ; Must directly follow vptest so the flags it produced are the ones saved.
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4344
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette