VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 95341

Last change on this file since 95341 was 95341, checked in by vboxsync, 2 years ago

VMM/IEM: Implemented the BLSR, BLSMSK and BLSI instructions. bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 103.4 KB
Line 
1; $Id: IEMAllAImpl.asm 95341 2022-06-22 10:37:37Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2022 Oracle Corporation
8;
9; This file is part of VirtualBox Open Source Edition (OSE), as
10; available from http://www.virtualbox.org. This file is free software;
11; you can redistribute it and/or modify it under the terms of the GNU
12; General Public License (GPL) as published by the Free Software
13; Foundation, in version 2 as it comes in the "COPYING" file of the
14; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16;
17
18
19;*********************************************************************************************************************************
20;* Header Files *
21;*********************************************************************************************************************************
22%include "VBox/asmdefs.mac"
23%include "VBox/err.mac"
24%include "iprt/x86.mac"
25
26
27;*********************************************************************************************************************************
28;* Defined Constants And Macros *
29;*********************************************************************************************************************************
30
;;
; RET XX / RET wrapper for fastcall.
;
; On 32-bit Windows the fastcall callee pops its own stack arguments, so the
; argument byte count is emitted with the RET.  Everywhere else a plain RET
; is used (32-bit non-Windows and all 64-bit ABIs are caller-cleanup here).
;
; @param 1      Number of argument bytes to pop on return (used on x86/Windows only).
;
%macro RET_FASTCALL 1
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
        ret     %1                      ; fastcall: callee cleans up the stack arguments.
 %else
        ret                             ; caller cleans up.
 %endif
%else
        ret                             ; AMD64: caller cleans up.
%endif
%endmacro
45
;;
; NAME for fastcall functions.
;
; Defaults to the plain NAME() mangling; on 32-bit Windows it is redefined to
; the fastcall decoration <prefix><name>@<cbArgs>.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
; escaping (or whatever the dollar is good for here).  Thus the ugly
; prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
60
;;
; BEGINPROC for fastcall functions.
;
; Emits the export/global directives appropriate for the target object format
; and then the (decorated) function label.
;
; @param 1      The function name (C).
; @param 2      The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
        export %1=NAME_FASTCALL(%1,%2,$@)               ; PE: export under the decorated name.
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
        export NAME(%1) NAME_FASTCALL(%1,%2,$@)         ; OMF needs this with NASM.
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global NAME_FASTCALL(%1,%2,$@)                  ; make the symbol linkable.
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
81
82
;
; We employ some macro assembly here to hide the calling convention differences.
;
; PROLOGUE_N_ARGS / EPILOGUE_N_ARGS pairs hide the per-ABI argument fetching
; and register preservation, and A0..A3 / T0..T2 name the argument and
; temporary registers for each convention:
;   - AMD64: all four arguments arrive in registers, so the prologues are
;     empty and the epilogues are a plain RET.
;   - x86 (fastcall): A0/A1 arrive in ecx/edx, A2/A3 are loaded from the
;     stack into ebx/esi (callee-saved, hence the pushes), and the _EX
;     epilogues pop the given number of stack argument bytes.
;
%ifdef RT_ARCH_AMD64
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64: arguments in rdi, rsi, rdx, rcx.
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64: arguments in rcx, rdx, r8, r9.
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif

 ; Temporary/scratch registers (volatile in both 64-bit conventions).
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86
 ; edi/ebx/esi are callee-saved, so the prologues push whatever the function
 ; uses and the epilogues restore them before popping the stack arguments.
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; A2 = first stack argument (above ret addr + saved ebx).
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; A2 = first stack argument (above ret addr + 3 saved regs).
        mov     esi, [esp + 12 + 4 + 4] ; A3 = second stack argument.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 ; x86 fastcall: A0/A1 in ecx/edx, A2/A3 loaded into ebx/esi by the prologues.
 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si
 ; Note: no A3_8 - esi has no 8-bit alias on x86.

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
271
272
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Merges the guest's modified+undefined flag subset into the host EFLAGS so
; the emulated instruction executes with the guest's flag state as input.
; (The %if guard is currently commented out, so the merge always happens.)
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param 1      The parameter (A0..A3) holding the pointer to the eflags.
; @param 2      The set of modified flags.
; @param 3      The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 ;%endif
%endmacro
292
;;
; Update the flag.
;
; Captures the host EFLAGS produced by the emulated instruction and merges
; the modified/undefined subset back into the guest eflags at [%1].
;
; @remarks      Clobbers T0, T1, stack.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                      ; T1 = host flags after the instruction.
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
312
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; Like IEM_SAVE_FLAGS, but additionally forces the %3 bits clear and the %4
; bits set in the saved guest eflags.
;
; @remarks      Clobbers T0, T1, stack.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      Mask of additional flags to always clear.
; @param 4      Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host flags after the instruction.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
336
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks      Clobbers T0, T1, stack, %6, EFLAGS.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      Mask of additional flags to always clear.
; @param 4      The result register to set SF by.
; @param 5      The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6      The (full) register containing the parity table index. Will be modified!

%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                      ; T2 = host flags (T0/xAX holds the result).
 %else
        push    T0                      ; no T2 on x86, so borrow T0 for the host flags...
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; ...and restore the saved result value.
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; index by the low result byte only.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
390
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Does not look at the host EFLAGS at all, only applies the constant clear
; and set masks to the guest eflags at [%1].
;
; @remarks      Clobbers T0.
; @param 1      The register pointing to the EFLAGS.
; @param 2      Mask of additional flags to always clear.
; @param 3      Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
411
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Same as IEM_ADJUST_FLAGS but additionally computes PF from the low byte of
; the value in %4 via the g_afParity lookup table.
;
; @remarks      Clobbers T0, %4, EFLAGS; also T2 on AMD64 (table address).
; @param 1      The register pointing to the EFLAGS.
; @param 2      Mask of additional flags to always clear.
; @param 3      Mask of additional flags to always set.
; @param 4      The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; Index by the low byte only.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
436
437
438;*********************************************************************************************************************************
439;* External Symbols *
440;*********************************************************************************************************************************
441extern NAME(g_afParity)
442
443
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      Non-zero if there should be a locked version.
; @param 3      The modified flags.
; @param 4      The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8         ; do the operation directly on the memory operand.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8            ; 64-bit value takes 8 argument bytes on the (x86) stack.
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8         ; same as above, but with a LOCK prefix.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; instr,lock, modified-flags, undefined flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
543
544
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32     ; 3-operand VEX form: result goes to T0.
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
584
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        ; Only three arguments (A0=dst ptr, A1=src, A2=eflags ptr), so the
        ; 3-arg prologue/epilogue must be used: the 4-arg variants would load
        ; a non-existent fourth argument into esi and pop 8 instead of the
        ; 4 stack-argument bytes on 32-bit hosts, corrupting the caller stack.
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]             ; load the destination operand,
        %1      T0_32, A1_32            ; ... apply the VEX op,
        mov     [A0], T0_32             ; ... and store the result.
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
626
627
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the shift count in A2.  No
; eflags pointer - these instructions leave the flags alone.
;
; @param 1      The instruction mnemonic.
; @param 2      The fallback instruction to use when the host lacks BMI2
;               (plain shift taking the count in cl).
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32     ; 3-operand BMI2 form, flags untouched.
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count -> cl for the legacy shift.
        %2      A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; MSC: A0 is rcx; swap so cl = count, A2 = dst ptr.
        %2      A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        ; Note: the result must be stored with a full 64-bit move inside each
        ; branch.  The original code stored only A1_32 (truncating the result)
        ; and then did 'mov [A0], A1' after the %endif, which on the MSC path
        ; dereferenced A0 *after* the xchg - i.e. it wrote through the shift
        ; count instead of the destination pointer.
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count -> cl for the legacy shift.
        %2      A1, cl
        mov     [A0], A1                ; store the full 64-bit result.
 %else
        xchg    A2, A0                  ; MSC: cl = count, A2 = dst ptr.
        %2      A1, cl
        mov     [A2], A1                ; store the full 64-bit result.
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, fallback instr
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr
691
692
;
; RORX uses a immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
; A0 = pointer to the destination, A1 = source value, A2 = rotate count.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count -> cl for the legacy rotate.
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; MSC: A0 is rcx; swap so cl = count, A2 = dst ptr.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32
710
711 %ifdef RT_ARCH_AMD64
712BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
713 PROLOGUE_3_ARGS
714 %ifdef ASM_CALL64_GCC
715 mov cl, A2_8
716 ror A1, cl
717 mov [A0], A1_32
718 %else
719 xchg A2, A0
720 ror A1, cl
721 mov [A2], A1_32
722 %endif
723 mov [A0], A1
724 EPILOGUE_3_ARGS
725ENDPROC iemAImpl_rorx_u64
726 %endif ; RT_ARCH_AMD64
727
728
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      Non-zero if there should be a locked version.
; @param 3      The modified flags.
; @param 4      The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
; BT leaves the destination untouched, so no locked variant is needed for it.
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
805
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leave it as is.  The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; The _intel variants use T1 for the result because IEM_ADJUST_FLAGS_WITH_PARITY
; clobbers T0.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
; @param 4      Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
; Native flag behaviour.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst          ; ZF=1: leave the destination untouched.
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; Intel flag behaviour (10980XE): all flags defined.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

; AMD flag behaviour (3990X): only ZF is modified.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

; BSF/BSR leave the destination unchanged on ZF=1 (param 4 = 1); TZCNT/LZCNT always write it.
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
957
958
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
BEGINCODE
; @param 1      EFLAGS that are modified.
; @param 2      Undefined EFLAGS.
; @param 3      Function suffix.
; @param 4      EFLAGS variation: 0 for native, 1 for intel (ignored),
;               2 for AMD (set AF, clear PF, ZF and SF).
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_16, word [A0]        ; A1 *= [A0]
        mov     [A0], A1_16
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        ; Intel variation: SF/PF are calculated from the result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF,       , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                _amd,   2
1014
1015
;
; XCHG for memory operands. This implies locking. No flag changes.
;
; Each function takes two arguments, first the pointer to the memory,
; then the pointer to the register. They all return void.
;
; Note: XCHG with a memory operand is implicitly locked by the CPU, so no
; explicit LOCK prefix is emitted in the _locked variants.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value.
        xchg    [A0], T0_8              ; atomically swap with memory.
        mov     [A1], T0_8              ; write old memory value back to the register.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif

; Unlocked variants for fDisregardLock mode.
; These do two plain loads and two plain stores - no atomicity guarantees.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; load both values, ...
        mov     T1_8, [A0]
        mov     [A0], T0_8              ; ... then store them swapped.
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        mov     T1, [A0]
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1096
1097
1098;
1099; XADD for memory operands.
1100;
1101; Each function takes three arguments, first the pointer to the
1102; memory/register, then the pointer to the register, and finally a pointer to
1103; eflags. They all return void.
1104;
1105BEGINCODE
;;
; XADD on the byte at [A0]: [A0] += *A1, *A1 = old [A0].
; Arithmetic flags are loaded from and merged back into the eflags
; variable pointed to by A2.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = addend (register operand)
        xadd    [A0], T0_8              ; [A0] += T0, T0 = old [A0]
        mov     [A1], T0_8              ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8
1115
;;
; XADD on the word at [A0]: [A0] += *A1, *A1 = old [A0].
; Flags are exchanged via the eflags variable pointed to by A2.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; T0 = addend (register operand)
        xadd    [A0], T0_16             ; [A0] += T0, T0 = old [A0]
        mov     [A1], T0_16             ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16
1125
;;
; XADD on the dword at [A0]: [A0] += *A1, *A1 = old [A0].
; Flags are exchanged via the eflags variable pointed to by A2.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; T0 = addend (register operand)
        xadd    [A0], T0_32             ; [A0] += T0, T0 = old [A0]
        mov     [A1], T0_32             ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32
1135
%ifdef RT_ARCH_AMD64
;;
; XADD on the qword at [A0]: [A0] += *A1, *A1 = old [A0].
; 64-bit hosts only.  Flags via the eflags variable pointed to by A2.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; T0 = addend (register operand)
        xadd    [A0], T0                ; [A0] += T0, T0 = old [A0]
        mov     [A1], T0                ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1147
;;
; Atomic (locked) XADD on the byte at [A0]; see iemAImpl_xadd_u8.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = addend (register operand)
        lock xadd [A0], T0_8            ; atomic: [A0] += T0, T0 = old [A0]
        mov     [A1], T0_8              ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked
1157
;;
; Atomic (locked) XADD on the word at [A0]; see iemAImpl_xadd_u16.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; T0 = addend (register operand)
        lock xadd [A0], T0_16           ; atomic: [A0] += T0, T0 = old [A0]
        mov     [A1], T0_16             ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked
1167
;;
; Atomic (locked) XADD on the dword at [A0]; see iemAImpl_xadd_u32.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; T0 = addend (register operand)
        lock xadd [A0], T0_32           ; atomic: [A0] += T0, T0 = old [A0]
        mov     [A1], T0_32             ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked
1177
%ifdef RT_ARCH_AMD64
;;
; Atomic (locked) XADD on the qword at [A0]; see iemAImpl_xadd_u64.
; 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; T0 = addend (register operand)
        lock xadd [A0], T0              ; atomic: [A0] += T0, T0 = old [A0]
        mov     [A1], T0                ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1189
1190
1191;
1192; CMPXCHG8B.
1193;
1194; These are tricky register wise, so the code is duplicated for each calling
1195; convention.
1196;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1198;
1199; C-proto:
1200; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1201; uint32_t *pEFlags));
1202;
1203; Note! Identical to iemAImpl_cmpxchg16b.
1204;
1205BEGINCODE
;
; Loads EBX:ECX and EDX:EAX from the pointer arguments, executes a locked
; cmpxchg8b on *pu64Dst, stores the (possibly updated) EDX:EAX back and
; merges ZF into the caller's eflags variable.  Hand-written per calling
; convention because rbx/ebx must be preserved and the fixed register
; operands collide with the argument registers.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved, needed for the EBX operand

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; EBX = low half of *pu64EbxEcx
        mov     ecx, [r8 + 4]           ; ECX = high half of *pu64EbxEcx
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; EAX = low half of *pu64EaxEdx (after flag load; it clobbers eax)
        mov     edx, [r11 + 4]          ; EDX = high half of *pu64EaxEdx

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write back EDX:EAX (memory value on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee-saved, needed for the EBX operand

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; EBX = low half of *pu64EbxEcx
        mov     ecx, [r11 + 4]          ; ECX = high half of *pu64EbxEcx
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; EAX = low half of *pu64EaxEdx
        mov     edx, [rsi + 4]          ; EDX = high half of *pu64EaxEdx

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; write back EDX:EAX (memory value on mismatch)
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; x86 fastcall: ecx/edx carry the first two args, the rest are on the stack.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = saved regs, 4 = return addr)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX = low half of *pu64EbxEcx
        mov     ecx, [ecx + 4]          ; ECX = high half (pointer no longer needed)
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EAX = low half of *pu64EaxEdx
        mov     edx, [esi + 4]          ; EDX = high half of *pu64EaxEdx

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; write back EDX:EAX (memory value on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments (fastcall)
%endif
ENDPROC iemAImpl_cmpxchg8b
1280
;;
; Locked variant - simply forwards, since the unlocked implementation
; always uses a lock prefix anyway.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1285
1286%ifdef RT_ARCH_AMD64
1287
1288;
1289; CMPXCHG16B.
1290;
1291; These are tricky register wise, so the code is duplicated for each calling
1292; convention.
1293;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1298; uint32_t *pEFlags));
1299;
1300; Note! Identical to iemAImpl_cmpxchg8b.
1301;
1302BEGINCODE
;
; Loads RBX:RCX and RDX:RAX from the pointer arguments, executes a locked
; cmpxchg16b on *pu128Dst, stores RDX:RAX back and merges ZF into the
; caller's eflags variable.  Same structure as iemAImpl_cmpxchg8b.
; NOTE(review): cmpxchg16b architecturally requires a 16-byte aligned
; memory operand - presumably guaranteed by the caller; confirm there.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved, needed for the RBX operand

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; RBX = low qword of *pu128RbxRcx
        mov     rcx, [r8 + 8]           ; RCX = high qword of *pu128RbxRcx
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; RAX = low qword of *pu128RaxRdx
        mov     rdx, [r11 + 8]          ; RDX = high qword of *pu128RaxRdx

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write back RDX:RAX (memory value on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee-saved, needed for the RBX operand

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; RBX = low qword of *pu128RbxRcx
        mov     rcx, [r11 + 8]          ; RCX = high qword of *pu128RbxRcx
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; RAX = low qword of *pu128RaxRdx
        mov     rdx, [rsi + 8]          ; RDX = high qword of *pu128RaxRdx

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; write back RDX:RAX (memory value on mismatch)
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1347
;;
; Locked variant - simply forwards, since the unlocked implementation
; always uses a lock prefix anyway.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
1352
1353%endif ; RT_ARCH_AMD64
1354
1355
1356;
1357; CMPXCHG.
1358;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1360;
1361; C-proto:
1362; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1363;
1364BEGINCODE
;;
; Emits the CMPXCHG implementations for 8, 16, 32 and 64-bit operands.
;
; @param 1      Lock prefix ('lock' or empty).
; @param 2      Function name suffix ('_locked' or empty).
;
; Each function: A0 = puXDst, A1 = puEax (accumulator in/out), A2 = uReg,
; A3 = pEFlags.
;
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; al = accumulator value (cmpxchg compares against al)
        %1 cmpxchg [A0], A2_8
        mov     [A1], al                ; hand back al (old memory value on mismatch)
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; ax = accumulator value
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax                ; hand back ax (old memory value on mismatch)
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; eax = accumulator value
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax               ; hand back eax (old memory value on mismatch)
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; rax = accumulator value
        %1 cmpxchg [A0], A2
        mov     [A1], rax               ; hand back rax (old memory value on mismatch)
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX = low half of *pu64Reg
        mov     ecx, [ecx + 4]          ; ECX = high half of *pu64Reg
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EDX:EAX = accumulator value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF=1 means the exchange succeeded (values were equal); in that case a
        ; self-compare produces the flags of comparing two equal values.  On
        ; mismatch (ZF=0) we must derive the flags from a 64-bit compare of the
        ; old accumulator (*pu64Rax) against the memory value now in EDX:EAX.
        ; Bugfix: this used to be 'jz', which fell through to 'cmp eax, eax' on
        ; a FAILED exchange and thus always reported ZF=1 to the guest.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax              ; hand back EDX:EAX (memory value on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store                  ; high halves differ: use their compare flags
        cmp     [esi], eax              ; high halves equal: compare the low halves
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1453
1454;;
1455; Macro for implementing a unary operator.
1456;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit accesses require hand
; coding.
1460;
1461; All the functions takes a pointer to the destination memory operand in A0,
1462; the source register operand in A1 and a pointer to eflags in A2.
1463;
1464; @param 1 The instruction mnemonic.
1465; @param 2 The modified flags.
1466; @param 3 The undefined flags.
1467;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
;; 8-bit variant: A0 = pointer to operand, A1 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

;; 8-bit locked variant: same, with bus lock for atomicity.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

;; 16-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;; 16-bit locked variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

;; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

;; 32-bit locked variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
;; 64-bit variant (64-bit hosts only; the 32-bit host version is in C).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

;; 64-bit locked variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1542
1543
1544;
1545; BSWAP. No flag changes.
1546;
1547; Each function takes one argument, pointer to the value to bswap
1548; (input/output). They all return void.
1549;
;;
; BSWAP with a 16-bit operand size.  The 66h prefix turns the 32-bit bswap
; into its 16-bit encoding; the result of that form is architecturally
; undefined, so this reproduces whatever the host CPU does.
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h                          ; operand-size prefix -> 16-bit bswap encoding
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1558
;;
; BSWAP of the dword at [A0] (in place).
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32                   ; reverse the four bytes
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1566
;;
; BSWAP of the qword at [A0] (in place).  On 32-bit hosts this is done as
; two dword bswaps whose results are stored into each other's slots.
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0                      ; reverse all eight bytes
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; T0 = low dword
        mov     T1, [A0 + 4]            ; T1 = high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; swapped low dword becomes the high dword
        mov     [A0], T1                ; swapped high dword becomes the low dword
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1585
1586
1587;;
1588; Macro for implementing a shift operation.
1589;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
1592;
1593; All the functions takes a pointer to the destination memory operand in A0,
1594; the shift count in A1 and a pointer to eflags in A2.
1595;
1596; @param 1 The instruction mnemonic.
1597; @param 2 The modified flags.
1598; @param 3 The undefined flags.
1599;
1600; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1601;
1602; @note the _intel and _amd variants are implemented in C.
1603;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
;; 8-bit variant: A0 = pointer to operand, A1 = shift count, A2 = pEFlags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; count must be in cl; A1 is not rcx here
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; MSC: count arrives in the cl-aliased reg, so swap ptr/count
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

;; 16-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
;; 64-bit variant (64-bit hosts only).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
 %else
        xchg    A1, A0
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1673
1674
1675;;
1676; Macro for implementing a double precision shift operation.
1677;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
1680;
1681; The functions takes the destination operand (r/m) in A0, the source (reg) in
1682; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1683;
1684; @param 1 The instruction mnemonic.
1685; @param 2 The modified flags.
1686; @param 3 The undefined flags.
1687;
1688; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1689;
1690; @note the _intel and _amd variants are implemented in C.
1691;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
;; 16-bit variant: A0 = pDst, A1 = source reg, A2 = count, A3 = pEFlags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; get the count into cl (A3 aliases rcx here)
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; restore A3 = pEFlags for IEM_SAVE_FLAGS
 %else
        xchg    A0, A2                  ; MSC: count into the cl-aliased reg, ptr into A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
;; 64-bit variant (64-bit hosts only).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12           ; pop the extra stack bytes of the 5th..nth args
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags.
IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1745
1746
1747;;
; Macro for implementing multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
1752;
1753; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1754; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1755; pointer to eflags in A3.
1756;
1757; The functions all return 0 so the caller can be used for div/idiv as well as
1758; for the mul/imul implementation.
1759;
1760; @param 1 The instruction mnemonic.
1761; @param 2 The modified flags.
1762; @param 3 The undefined flags.
1763; @param 4 Name suffix.
1764; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
1765;
1766; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1767;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
;; 8-bit variant: A0 = pu16AX (in: AL, out: full AX result), A1 = operand,
;; A2 = pEFlags.  Always returns 0 in eax.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]                ; al = multiplicand
        %1      A1_8                    ; ax = al * operand
        mov     [A0], ax                ; 8-bit mul produces a 16-bit result in ax
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        ; Intel EFLAGS variation: recalculate SF/PF from the result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; return 0 (shared convention with div/idiv)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

;; 16-bit variant: A0 = pu16AX, A1 = pu16DX, A2 = operand, A3 = pEFlags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOADS_FLAGS_DUMMY     ; (placeholder removed - see below)
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]                ; ax = multiplicand
 %ifdef ASM_CALL64_GCC
        %1      A2_16                   ; dx:ax = ax * operand
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; A1 lives in rdx here, which %1 clobbers; save the ptr
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

;; 32-bit variant: A0 = pu32EAX, A1 = pu32EDX, A2 = operand, A3 = pEFlags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]               ; eax = multiplicand
 %ifdef ASM_CALL64_GCC
        %1      A2_32                   ; edx:eax = eax * operand
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; A1 lives in rdx here, which %1 clobbers; save the ptr
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;; 64-bit variant: A0 = pu64RAX, A1 = pu64RDX, A2 = operand, A3 = pEFlags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]               ; rax = multiplicand
 %ifdef ASM_CALL64_GCC
        %1      A2                      ; rdx:rax = rax * operand
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; A1 lives in rdx here, which %1 clobbers; save the ptr
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; !RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags, suffix, eflags behaviour.
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
1864
1865
1866BEGINCODE
1867;;
1868; Worker function for negating a 32-bit number in T1:T0
1869; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        ; Computes T1:T0 = 0 - T1:T0 by parking two zeros on the stack,
        ; swapping them with T0/T1, and subtracting with borrow.
        push    0
        push    0
        xchg    T0_32, [xSP]            ; stack = old T0, T0 = 0
        xchg    T1_32, [xSP + xCB]      ; stack = old T1, T1 = 0
        sub     T0_32, [xSP]            ; T0 = 0 - old T0
        sbb     T1_32, [xSP + xCB]      ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the two scratch slots
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
1880
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating a 64-bit number in T1:T0
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        ; Same technique as the 32-bit worker: T1:T0 = 0 - T1:T0.
        push    0
        push    0
        xchg    T0, [xSP]               ; stack = old T0, T0 = 0
        xchg    T1, [xSP + xCB]         ; stack = old T1, T1 = 0
        sub     T0, [xSP]               ; T0 = 0 - old T0
        sbb     T1, [xSP + xCB]         ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the two scratch slots
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
1896
1897
1898;;
1899; Macro for implementing a division operations.
1900;
1901; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1902; 32-bit system where the 64-bit accesses requires hand coding.
1903;
1904; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1905; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1906; pointer to eflags in A3.
1907;
1908; The functions all return 0 on success and -1 if a divide error should be
1909; raised by the caller.
1910;
1911; @param 1 The instruction mnemonic.
1912; @param 2 The modified flags.
1913; @param 3 The undefined flags.
1914; @param 4 1 if signed, 0 if unsigned.
1915; @param 5 Function suffix.
1916; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
1917; 2 for AMD (set AF, clear PF, ZF and SF).
1918;
1919; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1920;
1921%macro IEMIMPL_DIV_OP 6
1922BEGINCODE
1923BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
1924 PROLOGUE_3_ARGS
1925
1926 ; div by chainsaw check.
1927 test A1_8, A1_8
1928 jz .div_zero
1929
1930 ; Overflow check - unsigned division is simple to verify, haven't
1931 ; found a simple way to check signed division yet unfortunately.
1932 %if %4 == 0
1933 cmp [A0 + 1], A1_8
1934 jae .div_overflow
1935 %else
1936 mov T0_16, [A0] ; T0 = dividend
1937 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
1938 test A1_8, A1_8
1939 js .divisor_negative
1940 test T0_16, T0_16
1941 jns .both_positive
1942 neg T0_16
1943.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
1944 push T0 ; Start off like unsigned below.
1945 shr T0_16, 7
1946 cmp T0_8, A1_8
1947 pop T0
1948 jb .div_no_overflow
1949 ja .div_overflow
1950 and T0_8, 0x7f ; Special case for covering (divisor - 1).
1951 cmp T0_8, A1_8
1952 jae .div_overflow
1953 jmp .div_no_overflow
1954
1955.divisor_negative:
1956 neg A1_8
1957 test T0_16, T0_16
1958 jns .one_of_each
1959 neg T0_16
1960.both_positive: ; Same as unsigned shifted by sign indicator bit.
1961 shr T0_16, 7
1962 cmp T0_8, A1_8
1963 jae .div_overflow
1964.div_no_overflow:
1965 mov A1, T1 ; restore divisor
1966 %endif
1967
1968 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1969 mov ax, [A0]
1970 %1 A1_8
1971 mov [A0], ax
1972 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
1973 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
1974 %else
1975 IEM_SAVE_FLAGS A2, %2, %3
1976 %endif
1977 xor eax, eax
1978
1979.return:
1980 EPILOGUE_3_ARGS
1981
1982.div_zero:
1983.div_overflow:
1984 mov eax, -1
1985 jmp .return
1986ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
1987
1988BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
1989 PROLOGUE_4_ARGS
1990
1991 ; div by chainsaw check.
1992 test A2_16, A2_16
1993 jz .div_zero
1994
1995 ; Overflow check - unsigned division is simple to verify, haven't
1996 ; found a simple way to check signed division yet unfortunately.
1997 %if %4 == 0
1998 cmp [A1], A2_16
1999 jae .div_overflow
2000 %else
2001 mov T0_16, [A1]
2002 shl T0_32, 16
2003 mov T0_16, [A0] ; T0 = dividend
2004 mov T1, A2 ; T1 = divisor
2005 test T1_16, T1_16
2006 js .divisor_negative
2007 test T0_32, T0_32
2008 jns .both_positive
2009 neg T0_32
2010.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2011 push T0 ; Start off like unsigned below.
2012 shr T0_32, 15
2013 cmp T0_16, T1_16
2014 pop T0
2015 jb .div_no_overflow
2016 ja .div_overflow
2017 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2018 cmp T0_16, T1_16
2019 jae .div_overflow
2020 jmp .div_no_overflow
2021
2022.divisor_negative:
2023 neg T1_16
2024 test T0_32, T0_32
2025 jns .one_of_each
2026 neg T0_32
2027.both_positive: ; Same as unsigned shifted by sign indicator bit.
2028 shr T0_32, 15
2029 cmp T0_16, T1_16
2030 jae .div_overflow
2031.div_no_overflow:
2032 %endif
2033
2034 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2035 %ifdef ASM_CALL64_GCC
2036 mov T1, A2
2037 mov ax, [A0]
2038 mov dx, [A1]
2039 %1 T1_16
2040 mov [A0], ax
2041 mov [A1], dx
2042 %else
2043 mov T1, A1
2044 mov ax, [A0]
2045 mov dx, [T1]
2046 %1 A2_16
2047 mov [A0], ax
2048 mov [T1], dx
2049 %endif
2050 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2051 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2052 %else
2053 IEM_SAVE_FLAGS A3, %2, %3
2054 %endif
2055 xor eax, eax
2056
2057.return:
2058 EPILOGUE_4_ARGS
2059
2060.div_zero:
2061.div_overflow:
2062 mov eax, -1
2063 jmp .return
2064ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2065
2066BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2067 PROLOGUE_4_ARGS
2068
2069 ; div by chainsaw check.
2070 test A2_32, A2_32
2071 jz .div_zero
2072
2073 ; Overflow check - unsigned division is simple to verify, haven't
2074 ; found a simple way to check signed division yet unfortunately.
2075 %if %4 == 0
2076 cmp [A1], A2_32
2077 jae .div_overflow
2078 %else
2079 push A2 ; save A2 so we modify it (we out of regs on x86).
2080 mov T0_32, [A0] ; T0 = dividend low
2081 mov T1_32, [A1] ; T1 = dividend high
2082 test A2_32, A2_32
2083 js .divisor_negative
2084 test T1_32, T1_32
2085 jns .both_positive
2086 call NAME(iemAImpl_negate_T0_T1_u32)
2087.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2088 push T0 ; Start off like unsigned below.
2089 shl T1_32, 1
2090 shr T0_32, 31
2091 or T1_32, T0_32
2092 cmp T1_32, A2_32
2093 pop T0
2094 jb .div_no_overflow
2095 ja .div_overflow
2096 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2097 cmp T0_32, A2_32
2098 jae .div_overflow
2099 jmp .div_no_overflow
2100
2101.divisor_negative:
2102 neg A2_32
2103 test T1_32, T1_32
2104 jns .one_of_each
2105 call NAME(iemAImpl_negate_T0_T1_u32)
2106.both_positive: ; Same as unsigned shifted by sign indicator bit.
2107 shl T1_32, 1
2108 shr T0_32, 31
2109 or T1_32, T0_32
2110 cmp T1_32, A2_32
2111 jae .div_overflow
2112.div_no_overflow:
2113 pop A2
2114 %endif
2115
2116 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2117 mov eax, [A0]
2118 %ifdef ASM_CALL64_GCC
2119 mov T1, A2
2120 mov eax, [A0]
2121 mov edx, [A1]
2122 %1 T1_32
2123 mov [A0], eax
2124 mov [A1], edx
2125 %else
2126 mov T1, A1
2127 mov eax, [A0]
2128 mov edx, [T1]
2129 %1 A2_32
2130 mov [A0], eax
2131 mov [T1], edx
2132 %endif
2133 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2134 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2135 %else
2136 IEM_SAVE_FLAGS A3, %2, %3
2137 %endif
2138 xor eax, eax
2139
2140.return:
2141 EPILOGUE_4_ARGS
2142
2143.div_overflow:
2144 %if %4 != 0
2145 pop A2
2146 %endif
2147.div_zero:
2148 mov eax, -1
2149 jmp .return
2150ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2151
2152 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2153BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2154 PROLOGUE_4_ARGS
2155
2156 test A2, A2
2157 jz .div_zero
2158 %if %4 == 0
2159 cmp [A1], A2
2160 jae .div_overflow
2161 %else
2162 push A2 ; save A2 so we modify it (we out of regs on x86).
2163 mov T0, [A0] ; T0 = dividend low
2164 mov T1, [A1] ; T1 = dividend high
2165 test A2, A2
2166 js .divisor_negative
2167 test T1, T1
2168 jns .both_positive
2169 call NAME(iemAImpl_negate_T0_T1_u64)
2170.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2171 push T0 ; Start off like unsigned below.
2172 shl T1, 1
2173 shr T0, 63
2174 or T1, T0
2175 cmp T1, A2
2176 pop T0
2177 jb .div_no_overflow
2178 ja .div_overflow
2179 mov T1, 0x7fffffffffffffff
2180 and T0, T1 ; Special case for covering (divisor - 1).
2181 cmp T0, A2
2182 jae .div_overflow
2183 jmp .div_no_overflow
2184
2185.divisor_negative:
2186 neg A2
2187 test T1, T1
2188 jns .one_of_each
2189 call NAME(iemAImpl_negate_T0_T1_u64)
2190.both_positive: ; Same as unsigned shifted by sign indicator bit.
2191 shl T1, 1
2192 shr T0, 63
2193 or T1, T0
2194 cmp T1, A2
2195 jae .div_overflow
2196.div_no_overflow:
2197 pop A2
2198 %endif
2199
2200 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2201 mov rax, [A0]
2202 %ifdef ASM_CALL64_GCC
2203 mov T1, A2
2204 mov rax, [A0]
2205 mov rdx, [A1]
2206 %1 T1
2207 mov [A0], rax
2208 mov [A1], rdx
2209 %else
2210 mov T1, A1
2211 mov rax, [A0]
2212 mov rdx, [T1]
2213 %1 A2
2214 mov [A0], rax
2215 mov [T1], rdx
2216 %endif
2217 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2218 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2219 %else
2220 IEM_SAVE_FLAGS A3, %2, %3
2221 %endif
2222 xor eax, eax
2223
2224.return:
2225 EPILOGUE_4_ARGS_EX 12
2226
2227.div_overflow:
2228 %if %4 != 0
2229 pop A2
2230 %endif
2231.div_zero:
2232 mov eax, -1
2233 jmp .return
2234ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2235 %endif ; !RT_ARCH_AMD64
2236
2237%endmacro
2238
; Instantiate the DIV/IDIV workers.
; Parameters: %1 = instruction, %2/%3 = EFLAGS masks handed to IEM_SAVE_FLAGS,
; %4 = non-zero for signed division (enables the idiv overflow pre-checks),
; %5 = function name suffix, %6 = vendor EFLAGS behaviour selector
; (0 = reference, 1 = intel, 2 = amd - the amd variant adjusts flags, see the macro).
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2245
2246
2247;;
2248; Macro for implementing memory fence operation.
2249;
2250; No return value, no operands or anything.
2251;
2252; @param 1 The instruction.
2253;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; emit the fence instruction itself (lfence/sfence/mfence)
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2265
2266;;
2267; Alternative for non-SSE2 host.
2268;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX
        xchg    xAX, [xSP]              ; xchg with a memory operand is implicitly LOCKed,
                                        ; which serializes memory accesses on pre-SSE2 CPUs.
        add     xSP, xCB                ; drop the scratch slot (xAX was restored by the xchg).
        ret
ENDPROC iemAImpl_alt_mem_fence
2275
2276
2277;;
2278; Initialize the FPU for the actual instruction being emulated, this means
2279; loading parts of the guest's control word and status word.
2280;
2281; @uses 24 bytes of stack. T0, T1
2282; @param 1 Expression giving the address of the FXSTATE of the guest.
2283;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; capture the current (host) FPU environment so we can patch it.

        ; FCW - for exception, precision and rounding control.
        movzx T0, word [%1 + X86FXSTATE.FCW]
        and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; only take mask/precision/rounding bits from the guest.
        mov [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx T1, word [%1 + X86FXSTATE.FSW]
        and T1, X86_FSW_C_MASK          ; guest condition code bits...
        movzx T0, word [xSP + X86FSTENV32P.FSW]
        and T0, X86_FSW_TOP_MASK        ; ...merged with the actual TOP so the stack layout stays valid.
        or T0, T1
        mov [xSP + X86FSTENV32P.FSW], T0_16

        fldenv [xSP]                    ; activate the merged environment.
%endmacro
2302
2303
2304;;
2305; Initialize the FPU for the actual instruction being emulated, this means
2306; loading parts of the guest's control word, status word, and update the
2307; tag word for the top register if it's empty.
2308;
2309; ASSUMES actual TOP=7
2310;
2311; @uses 24 bytes of stack. T0, T1
2312; @param 1 Expression giving the address of the FXSTATE of the guest.
2313;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; capture the current (host) FPU environment so we can patch it.

        ; FCW - for exception, precision and rounding control.
        movzx T0_32, word [%1 + X86FXSTATE.FCW]
        and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; only mask/precision/rounding bits.
        mov [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx T1_32, word [%1 + X86FXSTATE.FSW]
        and T1_32, X86_FSW_C_MASK       ; guest condition code bits...
        movzx T0_32, word [xSP + X86FSTENV32P.FSW]
        and T0_32, X86_FSW_TOP_MASK     ; ...merged with the actual TOP.
        or T0_32, T1_32
        mov [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx T1_32, word [%1 + X86FXSTATE.FSW]
        shr T1_32, X86_FSW_TOP_SHIFT    ; T1 = guest TOP, i.e. which FTW bit describes guest ST0.
        and T1_32, X86_FSW_TOP_SMASK
        bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc %%st0_not_empty
        or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv [xSP]                    ; activate the merged environment.
%endmacro
2341
2342
2343;;
2344; Need to move this as well somewhere better?
2345;
struc IEMFPURESULT
    .r80Result resw 5                   ; 80-bit extended-precision result (5 words).
    .FSW resw 1                         ; resulting FPU status word.
endstruc
2350
2351
2352;;
2353; Need to move this as well somewhere better?
2354;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; first 80-bit result (5 words).
    .FSW resw 1                         ; resulting FPU status word.
    .r80Result2 resw 5                  ; second 80-bit result (5 words).
endstruc
2360
2361
2362;
2363;---------------------- 16-bit signed integer operations ----------------------
2364;
2365
2366
2367;;
; Converts a 16-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
2373;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the 28-byte fnstenv/fldenv area used below.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits.
        fild word [A2]

        fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2390
2391
2392;;
2393; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2394;
2395; @param A0 FPU context (fxsave).
2396; @param A1 Where to return the output FSW.
2397; @param A2 Where to store the 16-bit signed integer value.
2398; @param A3 Pointer to the 80-bit value.
2399;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]                  ; load the 80-bit source before applying guest FCW/FSW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp word [A2]

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2415
2416
2417;;
2418; Store a 80-bit floating point value (register) as a 16-bit signed integer
2419; (memory) with truncation.
2420;
2421; @param A0 FPU context (fxsave).
2422; @param A1 Where to return the output FSW.
2423; @param A2 Where to store the 16-bit signed integer value.
2424; @param A3 Pointer to the 80-bit value.
2425;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp word [A2]                ; truncating store (ignores the rounding control).

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2441
2442
2443;;
2444; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2445;
2446; @param 1 The instruction
2447;
2448; @param A0 FPU context (fxsave).
2449; @param A1 Pointer to a IEMFPURESULT for the output.
2450; @param A2 Pointer to the 80-bit value.
2451; @param A3 Pointer to the 16-bit value.
2452;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 word [A3]                    ; e.g. fiadd word [A3] -> ST0 op= int16.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2479
2480
2481;;
2482; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2483; only returning FSW.
2484;
2485; @param 1 The instruction
2486;
2487; @param A0 FPU context (fxsave).
2488; @param A1 Where to store the output FSW.
2489; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
2491;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 word [A3]                    ; compare against the int16 in memory.

        fnstsw word [A1]                ; only the FSW is returned (no value result).

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2511
2512
2513
2514;
2515;---------------------- 32-bit signed integer operations ----------------------
2516;
2517
2518
2519;;
; Converts a 32-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
2525;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild dword [A2]

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2542
2543
2544;;
2545; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2546;
2547; @param A0 FPU context (fxsave).
2548; @param A1 Where to return the output FSW.
2549; @param A2 Where to store the 32-bit signed integer value.
2550; @param A3 Pointer to the 80-bit value.
2551;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp dword [A2]

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2567
2568
2569;;
2570; Store a 80-bit floating point value (register) as a 32-bit signed integer
2571; (memory) with truncation.
2572;
2573; @param A0 FPU context (fxsave).
2574; @param A1 Where to return the output FSW.
2575; @param A2 Where to store the 32-bit signed integer value.
2576; @param A3 Pointer to the 80-bit value.
2577;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp dword [A2]               ; truncating store (ignores the rounding control).

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2593
2594
2595;;
2596; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2597;
2598; @param 1 The instruction
2599;
2600; @param A0 FPU context (fxsave).
2601; @param A1 Pointer to a IEMFPURESULT for the output.
2602; @param A2 Pointer to the 80-bit value.
2603; @param A3 Pointer to the 32-bit value.
2604;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 dword [A3]                   ; e.g. fiadd dword [A3] -> ST0 op= int32.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2631
2632
2633;;
2634; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2635; only returning FSW.
2636;
2637; @param 1 The instruction
2638;
2639; @param A0 FPU context (fxsave).
2640; @param A1 Where to store the output FSW.
2641; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
2643;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 dword [A3]                   ; compare against the int32 in memory.

        fnstsw word [A1]                ; only the FSW is returned (no value result).

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2663
2664
2665
2666;
2667;---------------------- 64-bit signed integer operations ----------------------
2668;
2669
2670
2671;;
; Converts a 64-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
2677;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild qword [A2]

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2694
2695
2696;;
2697; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2698;
2699; @param A0 FPU context (fxsave).
2700; @param A1 Where to return the output FSW.
2701; @param A2 Where to store the 64-bit signed integer value.
2702; @param A3 Pointer to the 80-bit value.
2703;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp qword [A2]

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2719
2720
2721;;
2722; Store a 80-bit floating point value (register) as a 64-bit signed integer
2723; (memory) with truncation.
2724;
2725; @param A0 FPU context (fxsave).
2726; @param A1 Where to return the output FSW.
2727; @param A2 Where to store the 64-bit signed integer value.
2728; @param A3 Pointer to the 80-bit value.
2729;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp qword [A2]               ; truncating store (ignores the rounding control).

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2745
2746
2747
2748;
2749;---------------------- 32-bit floating point operations ----------------------
2750;
2751
2752;;
2753; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2754;
2755; @param A0 FPU context (fxsave).
2756; @param A1 Pointer to a IEMFPURESULT for the output.
2757; @param A2 Pointer to the 32-bit floating point value to convert.
2758;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld dword [A2]                  ; convert r32 -> r80 on load.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2775
2776
2777;;
2778; Store a 80-bit floating point value (register) as a 32-bit one (memory).
2779;
2780; @param A0 FPU context (fxsave).
2781; @param A1 Where to return the output FSW.
2782; @param A2 Where to store the 32-bit value.
2783; @param A3 Pointer to the 80-bit value.
2784;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst dword [A2]                  ; r80 -> r32 conversion honouring guest rounding control.

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2800
2801
2802;;
2803; FPU instruction working on one 80-bit and one 32-bit floating point value.
2804;
2805; @param 1 The instruction
2806;
2807; @param A0 FPU context (fxsave).
2808; @param A1 Pointer to a IEMFPURESULT for the output.
2809; @param A2 Pointer to the 80-bit value.
2810; @param A3 Pointer to the 32-bit value.
2811;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 dword [A3]                   ; e.g. fadd dword [A3] -> ST0 op= r32.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2838
2839
2840;;
2841; FPU instruction working on one 80-bit and one 32-bit floating point value,
2842; only returning FSW.
2843;
2844; @param 1 The instruction
2845;
2846; @param A0 FPU context (fxsave).
2847; @param A1 Where to store the output FSW.
2848; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
2850;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 dword [A3]                   ; compare against the r32 in memory.

        fnstsw word [A1]                ; only the FSW is returned (no value result).

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
2870
2871
2872
2873;
2874;---------------------- 64-bit floating point operations ----------------------
2875;
2876
2877;;
2878; Converts a 64-bit floating point value to a 80-bit one (fpu register).
2879;
2880; @param A0 FPU context (fxsave).
2881; @param A1 Pointer to a IEMFPURESULT for the output.
2882; @param A2 Pointer to the 64-bit floating point value to convert.
2883;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; was missing: reset to a clean FPU state like the r32/r80/d80 siblings,
                                        ; so stale host stack/exception state can't affect the load below.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld qword [A2]                  ; convert r64 -> r80 on load.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
2899
2900
2901;;
2902; Store a 80-bit floating point value (register) as a 64-bit one (memory).
2903;
2904; @param A0 FPU context (fxsave).
2905; @param A1 Where to return the output FSW.
2906; @param A2 Where to store the 64-bit value.
2907; @param A3 Pointer to the 80-bit value.
2908;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst qword [A2]                  ; r80 -> r64 conversion honouring guest rounding control.

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
2924
2925
2926;;
2927; FPU instruction working on one 80-bit and one 64-bit floating point value.
2928;
2929; @param 1 The instruction
2930;
2931; @param A0 FPU context (fxsave).
2932; @param A1 Pointer to a IEMFPURESULT for the output.
2933; @param A2 Pointer to the 80-bit value.
2934; @param A3 Pointer to the 64-bit value.
2935;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 qword [A3]                   ; e.g. fadd qword [A3] -> ST0 op= r64.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
2962
2963;;
2964; FPU instruction working on one 80-bit and one 64-bit floating point value,
2965; only returning FSW.
2966;
2967; @param 1 The instruction
2968;
2969; @param A0 FPU context (fxsave).
2970; @param A1 Where to store the output FSW.
2971; @param A2 Pointer to the 80-bit value.
2972; @param A3 Pointer to the 64-bit value.
2973;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 qword [A3]                   ; compare against the r64 in memory.

        fnstsw word [A1]                ; only the FSW is returned (no value result).

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
2993
2994
2995
2996;
2997;---------------------- 80-bit floating point operations ----------------------
2998;
2999
3000;;
3001; Loads a 80-bit floating point register value from memory.
3002;
3003; @param A0 FPU context (fxsave).
3004; @param A1 Pointer to a IEMFPURESULT for the output.
3005; @param A2 Pointer to the 80-bit floating point value to load.
3006;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld tword [A2]                  ; full 80-bit load, no conversion.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3023
3024
3025;;
3026; Store a 80-bit floating point register to memory
3027;
3028; @param A0 FPU context (fxsave).
3029; @param A1 Where to return the output FSW.
3030; @param A2 Where to store the 80-bit value.
3031; @param A3 Pointer to the 80-bit register value.
3032;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp tword [A2]                 ; full 80-bit store, no conversion.

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3048
3049
3050;;
3051; Loads an 80-bit floating point register value in BCD format from memory.
3052;
3053; @param A0 FPU context (fxsave).
3054; @param A1 Pointer to a IEMFPURESULT for the output.
3055; @param A2 Pointer to the 80-bit BCD value to load.
3056;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld tword [A2]                 ; packed BCD -> r80 conversion on load.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3073
3074
3075;;
3076; Store a 80-bit floating point register to memory as BCD
3077;
3078; @param A0 FPU context (fxsave).
3079; @param A1 Where to return the output FSW.
3080; @param A2 Where to store the 80-bit BCD value.
3081; @param A3 Pointer to the 80-bit register value.
3082;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp tword [A2]                ; r80 -> packed BCD conversion on store.

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3098
3099
3100;;
3101; FPU instruction working on two 80-bit floating point values.
3102;
3103; @param 1 The instruction
3104;
3105; @param A0 FPU context (fxsave).
3106; @param A1 Pointer to a IEMFPURESULT for the output.
3107; @param A2 Pointer to the first 80-bit value (ST0)
3108; @param A3 Pointer to the second 80-bit value (STn).
3109;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]                  ; load second operand first so it ends up in ST1...
        fld tword [A2]                  ; ...and the first operand in ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 %2                           ; %2 supplies the operand list ({st0, st1} or empty).

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3140
3141
3142;;
3143; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3144; storing the result in ST1 and popping the stack.
3145;
3146; @param 1 The instruction
3147;
3148; @param A0 FPU context (fxsave).
3149; @param A1 Pointer to a IEMFPURESULT for the output.
3150; @param A2 Pointer to the first 80-bit value (ST1).
3151; @param A3 Pointer to the second 80-bit value (ST0).
3152;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST1 operand loaded first...
        fld tword [A3]                  ; ...then the ST0 operand on top.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; instruction pops ST0, leaving the result in (new) ST0.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3177
3178
3179;;
3180; FPU instruction working on two 80-bit floating point values, only
3181; returning FSW.
3182;
3183; @param 1 The instruction
3184;
3185; @param A0 FPU context (fxsave).
3186; @param A1 Pointer to a uint16_t for the resulting FSW.
3187; @param A2 Pointer to the first 80-bit value.
3188; @param A3 Pointer to the second 80-bit value.
3189;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]                  ; second operand -> ST1.
        fld tword [A2]                  ; first operand -> ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 st0, st1

        fnstsw word [A1]                ; only the FSW (condition codes) is returned.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3211
3212
3213;;
3214; FPU instruction working on two 80-bit floating point values,
3215; returning FSW and EFLAGS (eax).
3216;
3217; @param 1 The instruction
3218;
3219; @returns EFLAGS in EAX.
3220; @param A0 FPU context (fxsave).
3221; @param A1 Pointer to a uint16_t for the resulting FSW.
3222; @param A2 Pointer to the first 80-bit value.
3223; @param A3 Pointer to the second 80-bit value.
3224;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]                  ; second operand -> ST1.
        fld tword [A2]                  ; first operand -> ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 st1                          ; fcomi/fucomi compare ST0 with ST1 and set EFLAGS.

        fnstsw word [A1]
        pushf                           ; return the EFLAGS produced by the comparison in xAX.
        pop xAX

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3248
3249
3250;;
3251; FPU instruction working on one 80-bit floating point value.
3252;
3253; @param 1 The instruction
3254;
3255; @param A0 FPU context (fxsave).
3256; @param A1 Pointer to a IEMFPURESULT for the output.
3257; @param A2 Pointer to the 80-bit value.
3258;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; unary operation on ST0.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3286
3287
3288;;
3289; FPU instruction working on one 80-bit floating point value, only
3290; returning FSW.
3291;
3292; @param 1 The instruction
3293; @param 2 Non-zero to also restore FTW.
3294;
3295; @param A0 FPU context (fxsave).
3296; @param A1 Pointer to a uint16_t for the resulting FSW.
3297; @param A2 Pointer to the 80-bit value.
3298;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the operand.
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0 ; also restore the ST0 tag (fxam reports empty regs).
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw word [A1]                ; only the FSW (condition codes) is returned.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3323
3324
3325
3326;;
3327; FPU instruction loading a 80-bit floating point constant.
3328;
3329; @param 1 The instruction
3330;
3331; @param A0 FPU context (fxsave).
3332; @param A1 Pointer to a IEMFPURESULT for the output.
3333;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; push the constant (fld1/fldpi/...); rounding control may matter.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3360
3361
3362;;
3363; FPU instruction working on one 80-bit floating point value, outputing two.
3364;
3365; @param 1 The instruction
3366;
3367; @param A0 FPU context (fxsave).
3368; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3369; @param A2 Pointer to the 80-bit value.
3370;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; produces two values: result2 in ST0, result1 in ST1.

        fnstsw word [A1 + IEMFPURESULTTWO.FSW]
        fnclex                          ; clear pending exceptions before each store.
        fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3396
3397
3398
3399
3400;---------------------- SSE and MMX Operations ----------------------
3401
3402;; @todo what do we need to do for MMX?
; Currently empty placeholders - see the @todo notes above; kept so call sites
; have a hook if MMX/SSE state save/restore turns out to be needed.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro
3413
3414
3415;;
3416; Media instruction working on two full sized registers.
3417;
3418; @param 1 The instruction
3419;
3420; @param A0 FPU context (fxsave).
3421; @param A1 Pointer to the first media register size operand (input/output).
3422; @param A2 Pointer to the second media register size operand (input).
3423;
%macro IEMIMPL_MEDIA_F2 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq mm0, [A1]                  ; first operand (input/output).
        movq mm1, [A2]                  ; second operand (input).
        %1 mm0, mm1
        movq [A1], mm0                  ; write the result back.

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm0, [A1]               ; unaligned loads: no alignment guarantee on the pointers.
        movdqu xmm1, [A2]
        %1 xmm0, xmm1
        movdqu [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pxor
IEMIMPL_MEDIA_F2 pcmpeqb
IEMIMPL_MEDIA_F2 pcmpeqw
IEMIMPL_MEDIA_F2 pcmpeqd
3456
3457
3458;;
3459; Media instruction working on one full sized and one half sized register (lower half).
3460;
3461; @param 1 The instruction
3462; @param 2 1 if MMX is included, 0 if not.
3463;
3464; @param A0 FPU context (fxsave).
3465; @param A1 Pointer to the first full sized media register operand (input/output).
3466; @param A2 Pointer to the second half sized media register operand (input).
3467;
; The source loads mirror the PUNPCKL* memory operand forms: the MMX worker
; reads only the low 32 bits of the source (mm, mm/m32) and the SSE worker
; only the low 64 bits (xmm, xmm/m64); the rest of the host register is
; zeroed by movd/movq but not consumed by the instruction.
3468%macro IEMIMPL_MEDIA_F1L1 2
3469 %if %2 != 0
3470BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3471 PROLOGUE_3_ARGS
3472 IEMIMPL_MMX_PROLOGUE
3473
3474 movq mm0, [A1] ; load full destination operand
3475 movd mm1, [A2] ; only the low 32 bits of the source are used
3476 %1 mm0, mm1
3477 movq [A1], mm0 ; write back the result
3478
3479 IEMIMPL_MMX_EPILOGUE
3480 EPILOGUE_3_ARGS
3481ENDPROC iemAImpl_ %+ %1 %+ _u64
3482 %endif
3483
3484BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3485 PROLOGUE_3_ARGS
3486 IEMIMPL_SSE_PROLOGUE
3487
3488 movdqu xmm0, [A1] ; unaligned-safe load of the destination
3489 movq xmm1, [A2] ; only the low 64 bits of the source are used
3490 %1 xmm0, xmm1
3491 movdqu [A1], xmm0 ; write back the result
3492
3493 IEMIMPL_SSE_EPILOGUE
3494 EPILOGUE_3_ARGS
3495ENDPROC iemAImpl_ %+ %1 %+ _u128
3496%endmacro
3497
; Low-half unpack/interleave instructions; MMX variants exist for all of
; these except punpcklqdq, which is SSE2-only (second macro argument = 0).
3498IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3499IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3500IEMIMPL_MEDIA_F1L1 punpckldq, 1
3501IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3502
3503
3504;;
3505; Media instruction working on one full sized and one half sized register (high half).
3506;
3507; @param 1 The instruction
3508; @param 2 1 if MMX is included, 0 if not.
3509;
3510; @param A0 FPU context (fxsave).
3511; @param A1 Pointer to the first full sized media register operand (input/output).
3512; @param A2 Pointer to the second full sized media register operand, where we
3513; will only use the upper half (input).
3514;
; Unlike IEMIMPL_MEDIA_F1L1 this loads the *entire* source operand (movq /
; movdqu) because the instruction consumes its upper half; a partial load
; would feed it zeros.
3515%macro IEMIMPL_MEDIA_F1H1 2
3516 %if %2 != 0
3517BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3518 PROLOGUE_3_ARGS
3519 IEMIMPL_MMX_PROLOGUE
3520
3521 movq mm0, [A1] ; load full destination operand
3522 movq mm1, [A2] ; full 64-bit load - the high half is what gets used
3523 %1 mm0, mm1
3524 movq [A1], mm0 ; write back the result
3525
3526 IEMIMPL_MMX_EPILOGUE
3527 EPILOGUE_3_ARGS
3528ENDPROC iemAImpl_ %+ %1 %+ _u64
3529 %endif
3530
3531BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3532 PROLOGUE_3_ARGS
3533 IEMIMPL_SSE_PROLOGUE
3534
3535 movdqu xmm0, [A1] ; unaligned-safe load of the destination
3536 movdqu xmm1, [A2] ; full 128-bit load - the high half is what gets used
3537 %1 xmm0, xmm1
3538 movdqu [A1], xmm0 ; write back the result
3539
3540 IEMIMPL_SSE_EPILOGUE
3541 EPILOGUE_3_ARGS
3542ENDPROC iemAImpl_ %+ %1 %+ _u128
3543%endmacro
3544
; High-half unpack/interleave instructions; MMX variants exist for all of
; these except punpckhqdq, which is SSE2-only (second macro argument = 0).
;
; These must be instantiated with IEMIMPL_MEDIA_F1H1, not IEMIMPL_MEDIA_F1L1:
; the F1L1 MMX body loads the source with movd (low 32 bits only, upper half
; zeroed), while PUNPCKH* consumes the *high* half of a full 64-bit source
; operand (mm, mm/m64), so the F1H1 body's full movq load is required for
; correct _u64 results.
3545IEMIMPL_MEDIA_F1H1 punpckhbw, 1
3546IEMIMPL_MEDIA_F1H1 punpckhwd, 1
3547IEMIMPL_MEDIA_F1H1 punpckhdq, 1
3548IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3549
3550
3551;
3552; Shufflers with evil 8-bit immediates.
3553;
3554
;;
; PSHUFW with the imm8 shuffle selector supplied at runtime.
;
; The immediate cannot be encoded dynamically, so we dispatch into a table of
; 256 pre-generated 'pshufw mm0, mm1, <imm> / ret' stubs, 5 bytes each,
; indexed by the immediate value.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the destination operand (input/output).
; @param A2 Pointer to the source operand (input).
; @param A3 The 8-bit immediate (shuffle selector).
;
3555BEGINPROC_FASTCALL iemAImpl_pshufw, 16
3556 PROLOGUE_4_ARGS
3557 IEMIMPL_MMX_PROLOGUE
3558
3559 movq mm0, [A1]
3560 movq mm1, [A2]
3561 lea T0, [A3 + A3*4] ; sizeof(pshufw+ret) == 5
3562 lea T1, [.imm0 xWrtRIP]
3563 lea T1, [T1 + T0] ; T1 = &.imm0 + immediate * 5
3564 call T1 ; invoke the stub for this immediate
3565 movq [A1], mm0
3566
3567 IEMIMPL_MMX_EPILOGUE
3568 EPILOGUE_4_ARGS
3569%assign bImm 0
3570%rep 256
3571.imm %+ bImm:
3572 pshufw mm0, mm1, bImm
3573 ret
3574 %assign bImm bImm + 1
3575%endrep
3576.immEnd: ; 256*5 == 0x500
3577dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3578dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3579ENDPROC iemAImpl_pshufw
3580
3581
;;
; Generates an SSE shuffle worker (pshufhw/pshuflw/pshufd) taking the imm8
; shuffle selector at runtime. Like iemAImpl_pshufw this dispatches into a
; table of 256 pre-generated '<insn> xmm0, xmm1, <imm> / ret' stubs, 6 bytes
; each, indexed by the immediate value.
;
; @param 1 The instruction.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the destination operand (input/output).
; @param A2 Pointer to the source operand (input).
; @param A3 The 8-bit immediate (shuffle selector).
;
3582%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
3583BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
3584 PROLOGUE_4_ARGS
3585 IEMIMPL_SSE_PROLOGUE
3586
3587 movdqu xmm0, [A1]
3588 movdqu xmm1, [A2]
3589 lea T1, [.imm0 xWrtRIP]
3590 lea T0, [A3 + A3*2] ; sizeof(pshufXX+ret) == 6: (A3 * 3) *2
3591 lea T1, [T1 + T0*2] ; T1 = &.imm0 + immediate * 6
3592 call T1 ; invoke the stub for this immediate
3593 movdqu [A1], xmm0
3594
3595 IEMIMPL_SSE_EPILOGUE
3596 EPILOGUE_4_ARGS
3597 %assign bImm 0
3598 %rep 256
3599.imm %+ bImm:
3600 %1 xmm0, xmm1, bImm
3601 ret
3602 %assign bImm bImm + 1
3603 %endrep
3604.immEnd: ; 256*6 == 0x600
3605dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3606dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3607ENDPROC iemAImpl_ %+ %1
3608%endmacro
3609
; Instantiate the SSE shuffle workers: iemAImpl_pshufhw, iemAImpl_pshuflw
; and iemAImpl_pshufd.
3610IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
3611IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
3612IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3613
3614
3615;
3616; Move byte mask.
3617;
3618
;;
; PMOVMSKB on a 64-bit (MMX) source operand: stores the byte-sign mask of
; the source to the 64-bit destination, zero-extended.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the 64-bit destination (output).
; @param A2 Pointer to the 64-bit source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]
        pmovmskb T0, mm1        ; fully overwrites T0 (zero-extending), so the
                                ; old 'mov T0, [A1]' preload was dead and is gone
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0 ; T0 is only 32 bits on x86 hosts; clear the
                                ; high dword of the 64-bit destination by hand
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u64
3633
;;
; PMOVMSKB on a 128-bit (SSE) source operand: stores the byte-sign mask of
; the source to the 64-bit destination, zero-extended.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the 64-bit destination (output).
; @param A2 Pointer to the 128-bit source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]      ; unaligned-safe load of the source
        pmovmskb T0, xmm1       ; fully overwrites T0 (zero-extending), so the
                                ; old 'mov T0, [A1]' preload was dead and is gone
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0 ; T0 is only 32 bits on x86 hosts; clear the
                                ; high dword of the 64-bit destination by hand
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u128
3648
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette