VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 95499

Last change on this file since 95499 was 95499, checked in by vboxsync, 2 years ago

VMM/IEM: [v]pshufb. bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 112.5 KB
Line 
1; $Id: IEMAllAImpl.asm 95499 2022-07-04 12:52:29Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2022 Oracle Corporation
8;
9; This file is part of VirtualBox Open Source Edition (OSE), as
10; available from http://www.virtualbox.org. This file is free software;
11; you can redistribute it and/or modify it under the terms of the GNU
12; General Public License (GPL) as published by the Free Software
13; Foundation, in version 2 as it comes in the "COPYING" file of the
14; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16;
17
18
19;*********************************************************************************************************************************
20;* Header Files *
21;*********************************************************************************************************************************
22%include "VBox/asmdefs.mac"
23%include "VBox/err.mac"
24%include "iprt/x86.mac"
25
26
27;*********************************************************************************************************************************
28;* Defined Constants And Macros *
29;*********************************************************************************************************************************
30
;;
; RET XX / RET wrapper for fastcall.
;
; @param 1  Number of stack argument bytes for the callee to pop on return.
;           Only 32-bit Windows fastcall actually pops its stack arguments;
;           every other target uses a plain RET (caller cleans up / register args).
;
%macro RET_FASTCALL 1
 %ifdef RT_ARCH_X86
  %ifdef RT_OS_WINDOWS
        ret %1                          ; x86 Windows fastcall: callee pops the stack args.
  %else
        ret                             ; x86 non-Windows: caller cleans up.
  %endif
 %else
        ret                             ; AMD64: all args in registers, nothing to pop.
 %endif
%endmacro
45
;;
; NAME for fastcall functions.
;
; Expands to the plain NAME() mangling everywhere except 32-bit Windows,
; where fastcall symbols are decorated as <prefix>Name@cbArgs.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;  escaping (or whatever the dollar is good for here).  Thus the ugly
;  prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef NAME_FASTCALL
  ; Windows x86 fastcall decoration: @Name@cbArgs (a_Prefix supplies the '@' or '$@').
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
60
;;
; BEGINPROC for fastcall functions.
;
; Emits the format-specific export/global directives for the (possibly
; decorated) symbol and opens the procedure by defining its label.
;
; @param 1  The function name (C).
; @param 2  The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
        export %1=NAME_FASTCALL(%1,%2,$@)       ; PE: export under the decorated name.
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
        export NAME(%1) NAME_FASTCALL(%1,%2,$@) ; OMF needs an explicit export record too.
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global NAME_FASTCALL(%1,%2,$@)          ; '$' escapes the '@' for yasm (see @todo above).
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
81
82
;
; We employ some macro assembly here to hide the calling convention differences.
;
%ifdef RT_ARCH_AMD64
 ;
 ; AMD64: up to four arguments arrive in registers (A0..A3 below), so the
 ; prologues have nothing to do and the epilogues are plain returns.  The
 ; _EX byte count only matters for the 32-bit stack-argument case.
 ;
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64 ABI: integer args in rdi, rsi, rdx, rcx.
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64 ABI: integer args in rcx, rdx, r8, r9.
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif

 ; Scratch registers - volatile in both 64-bit conventions, so no saving needed.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86
 ;
 ; 32-bit fastcall: A0/A1 arrive in ecx/edx; A2 and A3 come on the stack and
 ; are loaded into ebx/esi by the prologues.  T1 is edi, so the prologues
 ; save edi (and ebx/esi where used) as these are callee-saved on x86.
 ;
 %macro PROLOGUE_1_ARGS 0
        push    edi                     ; save T1.
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1                      ; pop %1 bytes of stack arguments.
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi                     ; save T1.
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx                     ; save A2's register.
        mov     ebx, [esp + 4 + 4]      ; load A2 from the stack (ret addr + saved ebx above it).
        push    edi                     ; save T1.
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1                      ; pop the stack-passed A2 (and any extra) on return.
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx                     ; save A2's register.
        push    edi                     ; save T1.
        push    esi                     ; save A3's register.
        mov     ebx, [esp + 12 + 4 + 0] ; A2: above ret addr + 3 saved registers.
        mov     esi, [esp + 12 + 4 + 4] ; A3: next stack slot.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1                      ; pop the stack-passed A2+A3 on return.
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx                 ; fastcall arg 0.
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx                 ; fastcall arg 1.
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx                 ; stack arg, loaded by PROLOGUE_3/4_ARGS.
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi                 ; stack arg, loaded by PROLOGUE_4_ARGS.
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi                 ; callee-saved; prologues push it.
 %define T1_32      edi
 %define T1_16      di
%endif
271
272
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Merges the guest's modified+undefined flag bits into the host EFLAGS so the
; emulated instruction executes with the guest's flag state.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param A2 The register pointing to the flags.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0                         ; NOTE: the condition is commented out, so the
        pushf                           ; store current flags  ; load is currently done unconditionally.
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 ;%endif
%endmacro
292
;;
; Update the flag.
;
; Captures the host EFLAGS produced by the emulated instruction and folds the
; modified/undefined bits back into the guest EFLAGS at [%1].
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                      ; T1 = host flags after the instruction.
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
312
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host flags after the instruction.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
336
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index. Will be modified!

%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                      ; T2 = host flags (T0/xAX holds the result, keep it).
 %else
        push    T0                      ; x86 has no T2: temporarily free T0 for the host flags.
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; restore the saved result register.
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; PF is calculated from the low result byte only.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP] ; RIP-relative; table lookup needs the base in a register.
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
390
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Does not read the host EFLAGS at all - purely rewrites bits in the guest copy.
;
; @remarks Clobbers T0.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
411
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Like IEM_ADJUST_FLAGS, but additionally computes PF from the low byte of %4
; via the g_afParity lookup table.
;
; @remarks Clobbers T0, T2 (AMD64 only), %4, EFLAGS.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; PF depends on the low byte only.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP] ; RIP-relative table access.
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
436
437
438;*********************************************************************************************************************************
439;* External Symbols *
440;*********************************************************************************************************************************
441extern NAME(g_afParity)
442
443
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8         ; execute the operation on the guest operand.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8         ; atomic variant for LOCK-prefixed guest code.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr,lock, modified-flags,                                                                undefined flags
IEMIMPL_BIN_OP add,  1,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF
IEMIMPL_BIN_OP xor,  1,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF
IEMIMPL_BIN_OP and,  1,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF
543
544
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32     ; three-operand VEX form: result goes to T0.
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                  instr,  modified-flags,                                       undefined-flags
IEMIMPL_VEX_BIN_OP andn,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF),  (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr,  (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF),               (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF),  (X86_EFL_AF | X86_EFL_PF)
585
;;
; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]             ; destination doubles as the first source.
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                    instr,   modified-flags,                                      undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
627
628
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand (the shift count for the fallbacks) in A2.  No EFLAGS pointer - these
; instructions leave the flags alone.
;
; @param 1 The instruction mnemonic.
; @param 2 Fallback instruction if applicable.
; @param 3 Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; the legacy shift needs the count in cl.
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                  ; MSC: A0 is rcx; swap so cl = count and A2 = dst ptr.
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count into cl for the legacy shift.
        %2      A1, cl
        mov     [A0], A1                ; Fixed: store the full 64-bit result (was a truncating 32-bit store).
   %else
        xchg    A2, A0                  ; cl = count, A2 = destination pointer.
        %2      A1, cl
        mov     [A2], A1                ; Fixed: full 64-bit store via A2; after the xchg A0 holds
                                        ; the count, so the previous trailing 'mov [A0], A1' was a
                                        ; wild write and has been removed.
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                        instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx,  sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx,  shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx,  shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep,  nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext,  nop,            0
700
701
;
; RORX uses an immediate byte for the shift count, so we only do a
; fallback implementation of that one (rotating via cl instead).
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count into cl (A2 = rdx, cl is free here).
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; MSC: A0 is rcx; swap so cl = count and A2 = dst ptr.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32
719
720 %ifdef RT_ARCH_AMD64
721BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
722 PROLOGUE_3_ARGS
723 %ifdef ASM_CALL64_GCC
724 mov cl, A2_8
725 ror A1, cl
726 mov [A0], A1_32
727 %else
728 xchg A2, A0
729 ror A1, cl
730 mov [A2], A1_32
731 %endif
732 mov [A0], A1
733 EPILOGUE_3_ARGS
734ENDPROC iemAImpl_rorx_u64
735 %endif ; RT_ARCH_AMD64
736
737
;
; MULX
;
; Stores the low half of the product at [A1] and the high half at [A0].
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect (mulx takes its implicit first factor from edx).
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32
757
758
;
; MULX fallback using plain MUL (edx:eax = eax * src) for CPUs without BMI2.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32                   ; edx:eax = eax * A2_32.
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
777
%ifdef RT_ARCH_AMD64
;
; 64-bit MULX and its MUL-based fallback (AMD64 only; 32-bit hosts use C code).
; Low half of the product goes to [A1], high half to [A0].
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect (mulx takes its implicit first factor from rdx).
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2                      ; rdx:rax = rax * A2.
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
817
818
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16        ; note: no 8-bit form exists for these instructions.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; bt has no locked form (it only reads), the others do.
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
895
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leaves it as is.  The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
; Native-flags variant: pass the host's flag behaviour straight through.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst          ; ZF=1: source was zero, leave destination untouched.
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; Intel variant: clears OF/SF/AF/CF (and ZF on the written path), sets PF from the result.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        ; Source was zero: ZF and PF are set, the rest cleared.
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

; AMD variant: only ZF is taken from the CPU result.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

; bsf/bsr leave the destination untouched on zero input (%4=1); tzcnt/lzcnt always write.
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF),              0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF),              0
1047
1048
;;
; Macro for implementing POPCNT.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16            ; register destination only; result stored via T0.
        mov     [A0], T0_16
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1098
1099
1100;
1101; IMUL is also a similar but yet different case (no lock, no mem dst).
1102; The rDX:rAX variant of imul is handled together with mul further down.
1103;
1104BEGINCODE
1105; @param 1 EFLAGS that are modified.
1106; @param 2 Undefined EFLAGS.
1107; @param 3 Function suffix.
1108; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored),
1109; 2 for AMD (set AF, clear PF, ZF and SF).
1110%macro IEMIMPL_IMUL_TWO 4
1111BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1112 PROLOGUE_3_ARGS
1113 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1114 imul A1_16, word [A0]
1115 mov [A0], A1_16
1116 %if %4 != 1
1117 IEM_SAVE_FLAGS A2, %1, %2
1118 %else
1119 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1120 %endif
1121 EPILOGUE_3_ARGS
1122ENDPROC iemAImpl_imul_two_u16 %+ %3
1123
1124BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1125 PROLOGUE_3_ARGS
1126 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1127 imul A1_32, dword [A0]
1128 mov [A0], A1_32
1129 %if %4 != 1
1130 IEM_SAVE_FLAGS A2, %1, %2
1131 %else
1132 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1133 %endif
1134 EPILOGUE_3_ARGS
1135ENDPROC iemAImpl_imul_two_u32 %+ %3
1136
1137 %ifdef RT_ARCH_AMD64
1138BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1139 PROLOGUE_3_ARGS
1140 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1141 imul A1, qword [A0]
1142 mov [A0], A1
1143 %if %4 != 1
1144 IEM_SAVE_FLAGS A2, %1, %2
1145 %else
1146 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1147 %endif
1148 EPILOGUE_3_ARGS_EX 8
1149ENDPROC iemAImpl_imul_two_u64 %+ %3
1150 %endif ; RT_ARCH_AMD64
1151%endmacro
1152IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1153IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1154IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1155
1156
1157;
1158; XCHG for memory operands. This implies locking. No flag changes.
1159;
1160; Each function takes two arguments, first the pointer to the memory,
1161; then the pointer to the register. They all return void.
1162;
1163BEGINCODE
1164BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1165 PROLOGUE_2_ARGS
1166 mov T0_8, [A1]
1167 xchg [A0], T0_8
1168 mov [A1], T0_8
1169 EPILOGUE_2_ARGS
1170ENDPROC iemAImpl_xchg_u8_locked
1171
1172BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1173 PROLOGUE_2_ARGS
1174 mov T0_16, [A1]
1175 xchg [A0], T0_16
1176 mov [A1], T0_16
1177 EPILOGUE_2_ARGS
1178ENDPROC iemAImpl_xchg_u16_locked
1179
1180BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1181 PROLOGUE_2_ARGS
1182 mov T0_32, [A1]
1183 xchg [A0], T0_32
1184 mov [A1], T0_32
1185 EPILOGUE_2_ARGS
1186ENDPROC iemAImpl_xchg_u32_locked
1187
1188%ifdef RT_ARCH_AMD64
1189BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1190 PROLOGUE_2_ARGS
1191 mov T0, [A1]
1192 xchg [A0], T0
1193 mov [A1], T0
1194 EPILOGUE_2_ARGS
1195ENDPROC iemAImpl_xchg_u64_locked
1196%endif
1197
; Unlocked variants for fDisregardLock mode.
;
; These do the exchange as two plain loads followed by two plain stores, i.e.
; non-atomically, avoiding the implicit LOCK of a memory xchg.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value
        mov     T1_8, [A0]              ; T1 = memory value
        mov     [A0], T0_8
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        mov     T1, [A0]
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1237
1238
1239;
1240; XADD for memory operands.
1241;
1242; Each function takes three arguments, first the pointer to the
1243; memory/register, then the pointer to the register, and finally a pointer to
1244; eflags. They all return void.
1245;
1246BEGINCODE
1247BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1248 PROLOGUE_3_ARGS
1249 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1250 mov T0_8, [A1]
1251 xadd [A0], T0_8
1252 mov [A1], T0_8
1253 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1254 EPILOGUE_3_ARGS
1255ENDPROC iemAImpl_xadd_u8
1256
1257BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1258 PROLOGUE_3_ARGS
1259 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1260 mov T0_16, [A1]
1261 xadd [A0], T0_16
1262 mov [A1], T0_16
1263 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1264 EPILOGUE_3_ARGS
1265ENDPROC iemAImpl_xadd_u16
1266
1267BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1268 PROLOGUE_3_ARGS
1269 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1270 mov T0_32, [A1]
1271 xadd [A0], T0_32
1272 mov [A1], T0_32
1273 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1274 EPILOGUE_3_ARGS
1275ENDPROC iemAImpl_xadd_u32
1276
1277%ifdef RT_ARCH_AMD64
1278BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1279 PROLOGUE_3_ARGS
1280 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1281 mov T0, [A1]
1282 xadd [A0], T0
1283 mov [A1], T0
1284 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1285 EPILOGUE_3_ARGS
1286ENDPROC iemAImpl_xadd_u64
1287%endif ; RT_ARCH_AMD64
1288
; Locked XADD variants - identical to the unlocked ones above except for the
; LOCK prefix on the xadd itself.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12       ; Note: AMD64-only, so the fastcall byte count is not used for decoration.
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1330
1331
1332;
1333; CMPXCHG8B.
1334;
1335; These are tricky register wise, so the code is duplicated for each calling
1336; convention.
1337;
1338; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1339;
1340; C-proto:
1341; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1342; uint32_t *pEFlags));
1343;
1344; Note! Identical to iemAImpl_cmpxchg16b.
1345;
1346BEGINCODE
1347BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1348%ifdef RT_ARCH_AMD64
1349 %ifdef ASM_CALL64_MSC
1350 push rbx
1351
1352 mov r11, rdx ; pu64EaxEdx (is also T1)
1353 mov r10, rcx ; pu64Dst
1354
1355 mov ebx, [r8]
1356 mov ecx, [r8 + 4]
1357 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1358 mov eax, [r11]
1359 mov edx, [r11 + 4]
1360
1361 lock cmpxchg8b [r10]
1362
1363 mov [r11], eax
1364 mov [r11 + 4], edx
1365 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1366
1367 pop rbx
1368 ret
1369 %else
1370 push rbx
1371
1372 mov r10, rcx ; pEFlags
1373 mov r11, rdx ; pu64EbxEcx (is also T1)
1374
1375 mov ebx, [r11]
1376 mov ecx, [r11 + 4]
1377 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1378 mov eax, [rsi]
1379 mov edx, [rsi + 4]
1380
1381 lock cmpxchg8b [rdi]
1382
1383 mov [rsi], eax
1384 mov [rsi + 4], edx
1385 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1386
1387 pop rbx
1388 ret
1389
1390 %endif
1391%else
1392 push esi
1393 push edi
1394 push ebx
1395 push ebp
1396
1397 mov edi, ecx ; pu64Dst
1398 mov esi, edx ; pu64EaxEdx
1399 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1400 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1401
1402 mov ebx, [ecx]
1403 mov ecx, [ecx + 4]
1404 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1405 mov eax, [esi]
1406 mov edx, [esi + 4]
1407
1408 lock cmpxchg8b [edi]
1409
1410 mov [esi], eax
1411 mov [esi + 4], edx
1412 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1413
1414 pop ebp
1415 pop ebx
1416 pop edi
1417 pop esi
1418 ret 8
1419%endif
1420ENDPROC iemAImpl_cmpxchg8b
1421
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b, so the locked variant is
        ; just a tail jump to the common implementation above.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1426
%ifdef RT_ARCH_AMD64

;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention:  cmpxchg16b has fixed register operands (RDX:RAX compare value,
; RCX:RBX replacement value), which collide with argument registers.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
; uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        ; MSC: rcx=pu128Dst, rdx=pu128RaxRdx, r8=pu128RbxRcx, r9=pEFlags
        push    rbx                     ; callee-saved, needed for the replacement value

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; store back RDX:RAX (old value on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; GCC/SysV: rdi=pu128Dst, rsi=pu128RaxRdx, rdx=pu128RbxRcx, rcx=pEFlags
        push    rbx                     ; callee-saved, needed for the replacement value

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; store back RDX:RAX (old value on mismatch)
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b

BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b, so the locked variant is
        ; just a tail jump to the common implementation above.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked

%endif ; RT_ARCH_AMD64
1495
1496
1497;
1498; CMPXCHG.
1499;
1500; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1501;
1502; C-proto:
1503; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1504;
1505BEGINCODE
1506%macro IEMIMPL_CMPXCHG 2
1507BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1508 PROLOGUE_4_ARGS
1509 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1510 mov al, [A1]
1511 %1 cmpxchg [A0], A2_8
1512 mov [A1], al
1513 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1514 EPILOGUE_4_ARGS
1515ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1516
1517BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1518 PROLOGUE_4_ARGS
1519 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1520 mov ax, [A1]
1521 %1 cmpxchg [A0], A2_16
1522 mov [A1], ax
1523 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1524 EPILOGUE_4_ARGS
1525ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1526
1527BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1528 PROLOGUE_4_ARGS
1529 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1530 mov eax, [A1]
1531 %1 cmpxchg [A0], A2_32
1532 mov [A1], eax
1533 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1534 EPILOGUE_4_ARGS
1535ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1536
1537BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1538%ifdef RT_ARCH_AMD64
1539 PROLOGUE_4_ARGS
1540 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1541 mov rax, [A1]
1542 %1 cmpxchg [A0], A2
1543 mov [A1], rax
1544 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1545 EPILOGUE_4_ARGS
1546%else
1547 ;
1548 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
1549 ;
1550 push esi
1551 push edi
1552 push ebx
1553 push ebp
1554
1555 mov edi, ecx ; pu64Dst
1556 mov esi, edx ; pu64Rax
1557 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1558 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1559
1560 mov ebx, [ecx]
1561 mov ecx, [ecx + 4]
1562 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1563 mov eax, [esi]
1564 mov edx, [esi + 4]
1565
1566 lock cmpxchg8b [edi]
1567
1568 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
1569 jz .cmpxchg8b_not_equal
1570 cmp eax, eax ; just set the other flags.
1571.store:
1572 mov [esi], eax
1573 mov [esi + 4], edx
1574 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1575
1576 pop ebp
1577 pop ebx
1578 pop edi
1579 pop esi
1580 ret 8
1581
1582.cmpxchg8b_not_equal:
1583 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1584 jne .store
1585 cmp [esi], eax
1586 jmp .store
1587
1588%endif
1589ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1590%endmacro ; IEMIMPL_CMPXCHG
1591
1592IEMIMPL_CMPXCHG , ,
1593IEMIMPL_CMPXCHG lock, _locked
1594
1595;;
1596; Macro for implementing a unary operator.
1597;
1598; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1599; variants, except on 32-bit system where the 64-bit accesses requires hand
1600; coding.
1601;
1602; All the functions takes a pointer to the destination memory operand in A0,
1603; the source register operand in A1 and a pointer to eflags in A2.
1604;
1605; @param 1 The instruction mnemonic.
1606; @param 2 The modified flags.
1607; @param 3 The undefined flags.
1608;
1609%macro IEMIMPL_UNARY_OP 3
1610BEGINCODE
1611BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1612 PROLOGUE_2_ARGS
1613 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1614 %1 byte [A0]
1615 IEM_SAVE_FLAGS A1, %2, %3
1616 EPILOGUE_2_ARGS
1617ENDPROC iemAImpl_ %+ %1 %+ _u8
1618
1619BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1620 PROLOGUE_2_ARGS
1621 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1622 lock %1 byte [A0]
1623 IEM_SAVE_FLAGS A1, %2, %3
1624 EPILOGUE_2_ARGS
1625ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1626
1627BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1628 PROLOGUE_2_ARGS
1629 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1630 %1 word [A0]
1631 IEM_SAVE_FLAGS A1, %2, %3
1632 EPILOGUE_2_ARGS
1633ENDPROC iemAImpl_ %+ %1 %+ _u16
1634
1635BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1636 PROLOGUE_2_ARGS
1637 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1638 lock %1 word [A0]
1639 IEM_SAVE_FLAGS A1, %2, %3
1640 EPILOGUE_2_ARGS
1641ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1642
1643BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1644 PROLOGUE_2_ARGS
1645 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1646 %1 dword [A0]
1647 IEM_SAVE_FLAGS A1, %2, %3
1648 EPILOGUE_2_ARGS
1649ENDPROC iemAImpl_ %+ %1 %+ _u32
1650
1651BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1652 PROLOGUE_2_ARGS
1653 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1654 lock %1 dword [A0]
1655 IEM_SAVE_FLAGS A1, %2, %3
1656 EPILOGUE_2_ARGS
1657ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1658
1659 %ifdef RT_ARCH_AMD64
1660BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1661 PROLOGUE_2_ARGS
1662 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1663 %1 qword [A0]
1664 IEM_SAVE_FLAGS A1, %2, %3
1665 EPILOGUE_2_ARGS
1666ENDPROC iemAImpl_ %+ %1 %+ _u64
1667
1668BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1669 PROLOGUE_2_ARGS
1670 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1671 lock %1 qword [A0]
1672 IEM_SAVE_FLAGS A1, %2, %3
1673 EPILOGUE_2_ARGS
1674ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1675 %endif ; RT_ARCH_AMD64
1676
1677%endmacro
1678
1679IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1680IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1681IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1682IEMIMPL_UNARY_OP not, 0, 0
1683
1684
1685;
1686; BSWAP. No flag changes.
1687;
1688; Each function takes one argument, pointer to the value to bswap
1689; (input/output). They all return void.
1690;
1691BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1692 PROLOGUE_1_ARGS
1693 mov T0_32, [A0] ; just in case any of the upper bits are used.
1694 db 66h
1695 bswap T0_32
1696 mov [A0], T0_32
1697 EPILOGUE_1_ARGS
1698ENDPROC iemAImpl_bswap_u16
1699
1700BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1701 PROLOGUE_1_ARGS
1702 mov T0_32, [A0]
1703 bswap T0_32
1704 mov [A0], T0_32
1705 EPILOGUE_1_ARGS
1706ENDPROC iemAImpl_bswap_u32
1707
1708BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1709%ifdef RT_ARCH_AMD64
1710 PROLOGUE_1_ARGS
1711 mov T0, [A0]
1712 bswap T0
1713 mov [A0], T0
1714 EPILOGUE_1_ARGS
1715%else
1716 PROLOGUE_1_ARGS
1717 mov T0, [A0]
1718 mov T1, [A0 + 4]
1719 bswap T0
1720 bswap T1
1721 mov [A0 + 4], T0
1722 mov [A0], T1
1723 EPILOGUE_1_ARGS
1724%endif
1725ENDPROC iemAImpl_bswap_u64
1726
1727
1728;;
1729; Macro for implementing a shift operation.
1730;
1731; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1732; 32-bit system where the 64-bit accesses requires hand coding.
1733;
1734; All the functions takes a pointer to the destination memory operand in A0,
1735; the shift count in A1 and a pointer to eflags in A2.
1736;
1737; @param 1 The instruction mnemonic.
1738; @param 2 The modified flags.
1739; @param 3 The undefined flags.
1740;
1741; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1742;
1743; @note the _intel and _amd variants are implemented in C.
1744;
1745%macro IEMIMPL_SHIFT_OP 3
1746BEGINCODE
1747BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1748 PROLOGUE_3_ARGS
1749 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1750 %ifdef ASM_CALL64_GCC
1751 mov cl, A1_8
1752 %1 byte [A0], cl
1753 %else
1754 xchg A1, A0
1755 %1 byte [A1], cl
1756 %endif
1757 IEM_SAVE_FLAGS A2, %2, %3
1758 EPILOGUE_3_ARGS
1759ENDPROC iemAImpl_ %+ %1 %+ _u8
1760
1761BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1762 PROLOGUE_3_ARGS
1763 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1764 %ifdef ASM_CALL64_GCC
1765 mov cl, A1_8
1766 %1 word [A0], cl
1767 %else
1768 xchg A1, A0
1769 %1 word [A1], cl
1770 %endif
1771 IEM_SAVE_FLAGS A2, %2, %3
1772 EPILOGUE_3_ARGS
1773ENDPROC iemAImpl_ %+ %1 %+ _u16
1774
1775BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1776 PROLOGUE_3_ARGS
1777 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1778 %ifdef ASM_CALL64_GCC
1779 mov cl, A1_8
1780 %1 dword [A0], cl
1781 %else
1782 xchg A1, A0
1783 %1 dword [A1], cl
1784 %endif
1785 IEM_SAVE_FLAGS A2, %2, %3
1786 EPILOGUE_3_ARGS
1787ENDPROC iemAImpl_ %+ %1 %+ _u32
1788
1789 %ifdef RT_ARCH_AMD64
1790BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1791 PROLOGUE_3_ARGS
1792 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1793 %ifdef ASM_CALL64_GCC
1794 mov cl, A1_8
1795 %1 qword [A0], cl
1796 %else
1797 xchg A1, A0
1798 %1 qword [A1], cl
1799 %endif
1800 IEM_SAVE_FLAGS A2, %2, %3
1801 EPILOGUE_3_ARGS
1802ENDPROC iemAImpl_ %+ %1 %+ _u64
1803 %endif ; RT_ARCH_AMD64
1804
1805%endmacro
1806
1807IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1808IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1809IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
1810IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
1811IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1812IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1813IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1814
1815
1816;;
1817; Macro for implementing a double precision shift operation.
1818;
1819; This will generate code for the 16, 32 and 64 bit accesses, except on
1820; 32-bit system where the 64-bit accesses requires hand coding.
1821;
1822; The functions takes the destination operand (r/m) in A0, the source (reg) in
1823; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1824;
1825; @param 1 The instruction mnemonic.
1826; @param 2 The modified flags.
1827; @param 3 The undefined flags.
1828;
1829; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1830;
1831; @note the _intel and _amd variants are implemented in C.
1832;
1833%macro IEMIMPL_SHIFT_DBL_OP 3
1834BEGINCODE
1835BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1836 PROLOGUE_4_ARGS
1837 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1838 %ifdef ASM_CALL64_GCC
1839 xchg A3, A2
1840 %1 [A0], A1_16, cl
1841 xchg A3, A2
1842 %else
1843 xchg A0, A2
1844 %1 [A2], A1_16, cl
1845 %endif
1846 IEM_SAVE_FLAGS A3, %2, %3
1847 EPILOGUE_4_ARGS
1848ENDPROC iemAImpl_ %+ %1 %+ _u16
1849
1850BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1851 PROLOGUE_4_ARGS
1852 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1853 %ifdef ASM_CALL64_GCC
1854 xchg A3, A2
1855 %1 [A0], A1_32, cl
1856 xchg A3, A2
1857 %else
1858 xchg A0, A2
1859 %1 [A2], A1_32, cl
1860 %endif
1861 IEM_SAVE_FLAGS A3, %2, %3
1862 EPILOGUE_4_ARGS
1863ENDPROC iemAImpl_ %+ %1 %+ _u32
1864
1865 %ifdef RT_ARCH_AMD64
1866BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1867 PROLOGUE_4_ARGS
1868 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1869 %ifdef ASM_CALL64_GCC
1870 xchg A3, A2
1871 %1 [A0], A1, cl
1872 xchg A3, A2
1873 %else
1874 xchg A0, A2
1875 %1 [A2], A1, cl
1876 %endif
1877 IEM_SAVE_FLAGS A3, %2, %3
1878 EPILOGUE_4_ARGS_EX 12
1879ENDPROC iemAImpl_ %+ %1 %+ _u64
1880 %endif ; RT_ARCH_AMD64
1881
1882%endmacro
1883
1884IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1885IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1886
1887
1888;;
1889; Macro for implementing a multiplication operations.
1890;
1891; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1892; 32-bit system where the 64-bit accesses requires hand coding.
1893;
1894; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1895; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1896; pointer to eflags in A3.
1897;
1898; The functions all return 0 so the caller can be used for div/idiv as well as
1899; for the mul/imul implementation.
1900;
1901; @param 1 The instruction mnemonic.
1902; @param 2 The modified flags.
1903; @param 3 The undefined flags.
1904; @param 4 Name suffix.
1905; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
1906;
1907; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1908;
1909%macro IEMIMPL_MUL_OP 5
1910BEGINCODE
1911BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
1912 PROLOGUE_3_ARGS
1913 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1914 mov al, [A0]
1915 %1 A1_8
1916 mov [A0], ax
1917 %if %5 != 1
1918 IEM_SAVE_FLAGS A2, %2, %3
1919 %else
1920 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
1921 %endif
1922 xor eax, eax
1923 EPILOGUE_3_ARGS
1924ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
1925
1926BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
1927 PROLOGUE_4_ARGS
1928 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1929 mov ax, [A0]
1930 %ifdef ASM_CALL64_GCC
1931 %1 A2_16
1932 mov [A0], ax
1933 mov [A1], dx
1934 %else
1935 mov T1, A1
1936 %1 A2_16
1937 mov [A0], ax
1938 mov [T1], dx
1939 %endif
1940 %if %5 != 1
1941 IEM_SAVE_FLAGS A3, %2, %3
1942 %else
1943 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
1944 %endif
1945 xor eax, eax
1946 EPILOGUE_4_ARGS
1947ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
1948
1949BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
1950 PROLOGUE_4_ARGS
1951 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1952 mov eax, [A0]
1953 %ifdef ASM_CALL64_GCC
1954 %1 A2_32
1955 mov [A0], eax
1956 mov [A1], edx
1957 %else
1958 mov T1, A1
1959 %1 A2_32
1960 mov [A0], eax
1961 mov [T1], edx
1962 %endif
1963 %if %5 != 1
1964 IEM_SAVE_FLAGS A3, %2, %3
1965 %else
1966 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
1967 %endif
1968 xor eax, eax
1969 EPILOGUE_4_ARGS
1970ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
1971
1972 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1973BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
1974 PROLOGUE_4_ARGS
1975 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1976 mov rax, [A0]
1977 %ifdef ASM_CALL64_GCC
1978 %1 A2
1979 mov [A0], rax
1980 mov [A1], rdx
1981 %else
1982 mov T1, A1
1983 %1 A2
1984 mov [A0], rax
1985 mov [T1], rdx
1986 %endif
1987 %if %5 != 1
1988 IEM_SAVE_FLAGS A3, %2, %3
1989 %else
1990 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
1991 %endif
1992 xor eax, eax
1993 EPILOGUE_4_ARGS_EX 12
1994ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
1995 %endif ; !RT_ARCH_AMD64
1996
1997%endmacro
1998
1999IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2000IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2001IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2002IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2003IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2004IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2005
2006
BEGINCODE
;;
; Worker function for negating a 32-bit number pair in T1:T0 (T1:T0 = -T1:T0).
;
; Works by parking the old value in two zero-initialized stack slots and
; subtracting it from zero with borrow propagation (sub/sbb), so no extra
; registers are needed.
;
; @uses None (T0,T1) - EFLAGS are clobbered by the sub/sbb.
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0                       ; [xSP + xCB] = 0 (high slot after xchgs)
        push    0                       ; [xSP]       = 0 (low slot after xchgs)
        xchg    T0_32, [xSP]            ; stash old T0, T0 = 0
        xchg    T1_32, [xSP + xCB]      ; stash old T1, T1 = 0
        sub     T0_32, [xSP]            ; T0 = 0 - old T0
        sbb     T1_32, [xSP + xCB]      ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2021
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating a 64-bit number pair in T1:T0 (T1:T0 = -T1:T0).
;
; Same stack-slot technique as iemAImpl_negate_T0_T1_u32, but on the full
; 64-bit registers (AMD64 only).
;
; @uses None (T0,T1) - EFLAGS are clobbered by the sub/sbb.
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0                       ; [xSP + xCB] = 0 (high slot after xchgs)
        push    0                       ; [xSP]       = 0 (low slot after xchgs)
        xchg    T0, [xSP]               ; stash old T0, T0 = 0
        xchg    T1, [xSP + xCB]         ; stash old T1, T1 = 0
        sub     T0, [xSP]               ; T0 = 0 - old T0
        sbb     T1, [xSP + xCB]         ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2037
2038
2039;;
2040; Macro for implementing a division operations.
2041;
2042; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2043; 32-bit system where the 64-bit accesses requires hand coding.
2044;
2045; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2046; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2047; pointer to eflags in A3.
2048;
2049; The functions all return 0 on success and -1 if a divide error should be
2050; raised by the caller.
2051;
2052; @param 1 The instruction mnemonic.
2053; @param 2 The modified flags.
2054; @param 3 The undefined flags.
2055; @param 4 1 if signed, 0 if unsigned.
2056; @param 5 Function suffix.
2057; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2058; 2 for AMD (set AF, clear PF, ZF and SF).
2059;
2060; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2061;
2062%macro IEMIMPL_DIV_OP 6
2063BEGINCODE
2064BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2065 PROLOGUE_3_ARGS
2066
2067 ; div by chainsaw check.
2068 test A1_8, A1_8
2069 jz .div_zero
2070
2071 ; Overflow check - unsigned division is simple to verify, haven't
2072 ; found a simple way to check signed division yet unfortunately.
2073 %if %4 == 0
2074 cmp [A0 + 1], A1_8
2075 jae .div_overflow
2076 %else
2077 mov T0_16, [A0] ; T0 = dividend
2078 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2079 test A1_8, A1_8
2080 js .divisor_negative
2081 test T0_16, T0_16
2082 jns .both_positive
2083 neg T0_16
2084.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2085 push T0 ; Start off like unsigned below.
2086 shr T0_16, 7
2087 cmp T0_8, A1_8
2088 pop T0
2089 jb .div_no_overflow
2090 ja .div_overflow
2091 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2092 cmp T0_8, A1_8
2093 jae .div_overflow
2094 jmp .div_no_overflow
2095
2096.divisor_negative:
2097 neg A1_8
2098 test T0_16, T0_16
2099 jns .one_of_each
2100 neg T0_16
2101.both_positive: ; Same as unsigned shifted by sign indicator bit.
2102 shr T0_16, 7
2103 cmp T0_8, A1_8
2104 jae .div_overflow
2105.div_no_overflow:
2106 mov A1, T1 ; restore divisor
2107 %endif
2108
2109 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2110 mov ax, [A0]
2111 %1 A1_8
2112 mov [A0], ax
2113 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2114 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2115 %else
2116 IEM_SAVE_FLAGS A2, %2, %3
2117 %endif
2118 xor eax, eax
2119
2120.return:
2121 EPILOGUE_3_ARGS
2122
2123.div_zero:
2124.div_overflow:
2125 mov eax, -1
2126 jmp .return
2127ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2128
2129BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2130 PROLOGUE_4_ARGS
2131
2132 ; div by chainsaw check.
2133 test A2_16, A2_16
2134 jz .div_zero
2135
2136 ; Overflow check - unsigned division is simple to verify, haven't
2137 ; found a simple way to check signed division yet unfortunately.
2138 %if %4 == 0
2139 cmp [A1], A2_16
2140 jae .div_overflow
2141 %else
2142 mov T0_16, [A1]
2143 shl T0_32, 16
2144 mov T0_16, [A0] ; T0 = dividend
2145 mov T1, A2 ; T1 = divisor
2146 test T1_16, T1_16
2147 js .divisor_negative
2148 test T0_32, T0_32
2149 jns .both_positive
2150 neg T0_32
2151.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2152 push T0 ; Start off like unsigned below.
2153 shr T0_32, 15
2154 cmp T0_16, T1_16
2155 pop T0
2156 jb .div_no_overflow
2157 ja .div_overflow
2158 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2159 cmp T0_16, T1_16
2160 jae .div_overflow
2161 jmp .div_no_overflow
2162
2163.divisor_negative:
2164 neg T1_16
2165 test T0_32, T0_32
2166 jns .one_of_each
2167 neg T0_32
2168.both_positive: ; Same as unsigned shifted by sign indicator bit.
2169 shr T0_32, 15
2170 cmp T0_16, T1_16
2171 jae .div_overflow
2172.div_no_overflow:
2173 %endif
2174
2175 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2176 %ifdef ASM_CALL64_GCC
2177 mov T1, A2
2178 mov ax, [A0]
2179 mov dx, [A1]
2180 %1 T1_16
2181 mov [A0], ax
2182 mov [A1], dx
2183 %else
2184 mov T1, A1
2185 mov ax, [A0]
2186 mov dx, [T1]
2187 %1 A2_16
2188 mov [A0], ax
2189 mov [T1], dx
2190 %endif
2191 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2192 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2193 %else
2194 IEM_SAVE_FLAGS A3, %2, %3
2195 %endif
2196 xor eax, eax
2197
2198.return:
2199 EPILOGUE_4_ARGS
2200
2201.div_zero:
2202.div_overflow:
2203 mov eax, -1
2204 jmp .return
2205ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2206
2207BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2208 PROLOGUE_4_ARGS
2209
2210 ; div by chainsaw check.
2211 test A2_32, A2_32
2212 jz .div_zero
2213
2214 ; Overflow check - unsigned division is simple to verify, haven't
2215 ; found a simple way to check signed division yet unfortunately.
2216 %if %4 == 0
2217 cmp [A1], A2_32
2218 jae .div_overflow
2219 %else
2220 push A2 ; save A2 so we modify it (we out of regs on x86).
2221 mov T0_32, [A0] ; T0 = dividend low
2222 mov T1_32, [A1] ; T1 = dividend high
2223 test A2_32, A2_32
2224 js .divisor_negative
2225 test T1_32, T1_32
2226 jns .both_positive
2227 call NAME(iemAImpl_negate_T0_T1_u32)
2228.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2229 push T0 ; Start off like unsigned below.
2230 shl T1_32, 1
2231 shr T0_32, 31
2232 or T1_32, T0_32
2233 cmp T1_32, A2_32
2234 pop T0
2235 jb .div_no_overflow
2236 ja .div_overflow
2237 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2238 cmp T0_32, A2_32
2239 jae .div_overflow
2240 jmp .div_no_overflow
2241
2242.divisor_negative:
2243 neg A2_32
2244 test T1_32, T1_32
2245 jns .one_of_each
2246 call NAME(iemAImpl_negate_T0_T1_u32)
2247.both_positive: ; Same as unsigned shifted by sign indicator bit.
2248 shl T1_32, 1
2249 shr T0_32, 31
2250 or T1_32, T0_32
2251 cmp T1_32, A2_32
2252 jae .div_overflow
2253.div_no_overflow:
2254 pop A2
2255 %endif
2256
2257 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2258 mov eax, [A0]
2259 %ifdef ASM_CALL64_GCC
2260 mov T1, A2
2261 mov eax, [A0]
2262 mov edx, [A1]
2263 %1 T1_32
2264 mov [A0], eax
2265 mov [A1], edx
2266 %else
2267 mov T1, A1
2268 mov eax, [A0]
2269 mov edx, [T1]
2270 %1 A2_32
2271 mov [A0], eax
2272 mov [T1], edx
2273 %endif
2274 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2275 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2276 %else
2277 IEM_SAVE_FLAGS A3, %2, %3
2278 %endif
2279 xor eax, eax
2280
2281.return:
2282 EPILOGUE_4_ARGS
2283
2284.div_overflow:
2285 %if %4 != 0
2286 pop A2
2287 %endif
2288.div_zero:
2289 mov eax, -1
2290 jmp .return
2291ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2292
2293 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2294BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2295 PROLOGUE_4_ARGS
2296
2297 test A2, A2
2298 jz .div_zero
2299 %if %4 == 0
2300 cmp [A1], A2
2301 jae .div_overflow
2302 %else
2303 push A2 ; save A2 so we modify it (we out of regs on x86).
2304 mov T0, [A0] ; T0 = dividend low
2305 mov T1, [A1] ; T1 = dividend high
2306 test A2, A2
2307 js .divisor_negative
2308 test T1, T1
2309 jns .both_positive
2310 call NAME(iemAImpl_negate_T0_T1_u64)
2311.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2312 push T0 ; Start off like unsigned below.
2313 shl T1, 1
2314 shr T0, 63
2315 or T1, T0
2316 cmp T1, A2
2317 pop T0
2318 jb .div_no_overflow
2319 ja .div_overflow
2320 mov T1, 0x7fffffffffffffff
2321 and T0, T1 ; Special case for covering (divisor - 1).
2322 cmp T0, A2
2323 jae .div_overflow
2324 jmp .div_no_overflow
2325
2326.divisor_negative:
2327 neg A2
2328 test T1, T1
2329 jns .one_of_each
2330 call NAME(iemAImpl_negate_T0_T1_u64)
2331.both_positive: ; Same as unsigned shifted by sign indicator bit.
2332 shl T1, 1
2333 shr T0, 63
2334 or T1, T0
2335 cmp T1, A2
2336 jae .div_overflow
2337.div_no_overflow:
2338 pop A2
2339 %endif
2340
2341 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2342 mov rax, [A0]
2343 %ifdef ASM_CALL64_GCC
2344 mov T1, A2
2345 mov rax, [A0]
2346 mov rdx, [A1]
2347 %1 T1
2348 mov [A0], rax
2349 mov [A1], rdx
2350 %else
2351 mov T1, A1
2352 mov rax, [A0]
2353 mov rdx, [T1]
2354 %1 A2
2355 mov [A0], rax
2356 mov [T1], rdx
2357 %endif
2358 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2359 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2360 %else
2361 IEM_SAVE_FLAGS A3, %2, %3
2362 %endif
2363 xor eax, eax
2364
2365.return:
2366 EPILOGUE_4_ARGS_EX 12
2367
2368.div_overflow:
2369 %if %4 != 0
2370 pop A2
2371 %endif
2372.div_zero:
2373 mov eax, -1
2374 jmp .return
2375ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2376 %endif ; !RT_ARCH_AMD64
2377
2378%endmacro
2379
; Instantiations of IEMIMPL_DIV_OP.
; Parameters as used by the macro body: %1 = instruction (div/idiv);
; %2/%3 = EFLAGS passed to IEM_MAYBE_LOAD_FLAGS/IEM_SAVE_FLAGS (modified/undefined sets);
; %4 = non-zero for signed division (enables the signed overflow pre-checks);
; %5 = symbol name suffix; %6 = vendor flag behavior (0 = use real flags,
; 1 = Intel, 2 = AMD - sets AF and clears PF/ZF/SF via IEM_ADJUST_FLAGS).
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2386
2387
2388;;
2389; Macro for implementing memory fence operation.
2390;
2391; No return value, no operands or anything.
2392;
2393; @param 1 The instruction.
2394;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; the fence instruction itself (lfence/sfence/mfence)
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2406
2407;;
2408; Alternative for non-SSE2 host.
2409;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; make a stack slot holding the current xAX value
        xchg    xAX, [xSP]              ; xchg with memory is implicitly LOCKed => serializing barrier; also restores xAX
        add     xSP, xCB                ; drop the scratch stack slot again
        ret
ENDPROC iemAImpl_alt_mem_fence
2416
2417
2418;;
2419; Initialize the FPU for the actual instruction being emulated, this means
2420; loading parts of the guest's control word and status word.
2421;
2422; @uses 24 bytes of stack. T0, T1
2423; @param 1 Expression giving the address of the FXSTATE of the guest.
2424;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; dump the current (host) FPU environment so we can patch it in place

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; only exception masks, precision and rounding from the guest
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK      ; take the guest condition code bits ...
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK    ; ... merged with the actual (host) TOP value
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; activate the merged environment
%endmacro
2443
2444
2445;;
2446; Initialize the FPU for the actual instruction being emulated, this means
2447; loading parts of the guest's control word, status word, and update the
2448; tag word for the top register if it's empty.
2449;
2450; ASSUMES actual TOP=7
2451;
2452; @uses 24 bytes of stack. T0, T1
2453; @param 1 Expression giving the address of the FXSTATE of the guest.
2454;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; dump the current (host) FPU environment so we can patch it in place

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; only exception masks, precision and rounding from the guest
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK   ; take the guest condition code bits ...
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; ... merged with the actual (host) TOP value
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT ; T1 = guest TOP, i.e. the FTW bit index of the guest ST0
        and     T1_32, X86_FSW_TOP_SMASK
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]                   ; activate the merged environment
%endmacro
2482
2483
2484;;
2485; Need to move this as well somewhere better?
2486;
struc IEMFPURESULT
    .r80Result resw 5                   ; 80-bit (tword) floating point result value
    .FSW       resw 1                   ; output FPU status word
endstruc
2491
2492
2493;;
2494; Need to move this as well somewhere better?
2495;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; first 80-bit (tword) floating point result
    .FSW        resw 1                  ; output FPU status word
    .r80Result2 resw 5                  ; second 80-bit (tword) floating point result
endstruc
2501
2502
2503;
2504;---------------------- 16-bit signed integer operations ----------------------
2505;
2506
2507
2508;;
2509; Converts a 16-bit floating point value to a 80-bit one (fpu register).
2510;
2511; @param A0 FPU context (fxsave).
2512; @param A1 Pointer to a IEMFPURESULT for the output.
2513; @param A2 Pointer to the 16-bit floating point value to convert.
2514;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe guest FSW bits
        fild    word [A2]               ; load/convert the 16-bit signed integer

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2531
2532
2533;;
2534; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2535;
2536; @param A0 FPU context (fxsave).
2537; @param A1 Where to return the output FSW.
2538; @param A2 Where to store the 16-bit signed integer value.
2539; @param A3 Pointer to the 80-bit value.
2540;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; load the 80-bit source value into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding control!) + safe FSW bits
        fistp   word [A2]               ; store (and pop) as 16-bit signed integer

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2556
2557
2558;;
2559; Store a 80-bit floating point value (register) as a 16-bit signed integer
2560; (memory) with truncation.
2561;
2562; @param A0 FPU context (fxsave).
2563; @param A1 Where to return the output FSW.
2564; @param A2 Where to store the 16-bit signed integer value.
2565; @param A3 Pointer to the 80-bit value.
2566;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; load the 80-bit source value into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        fisttp  word [A2]               ; store (and pop) as 16-bit signed integer, always truncating

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2582
2583
2584;;
2585; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2586;
2587; @param 1 The instruction
2588;
2589; @param A0 FPU context (fxsave).
2590; @param A1 Pointer to a IEMFPURESULT for the output.
2591; @param A2 Pointer to the 80-bit value.
2592; @param A3 Pointer to the 16-bit value.
2593;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      word [A3]               ; ST0 <op>= 16-bit integer memory operand

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2620
2621
2622;;
2623; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2624; only returning FSW.
2625;
2626; @param 1 The instruction
2627;
2628; @param A0 FPU context (fxsave).
2629; @param A1 Where to store the output FSW.
2630; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
2632;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      word [A3]               ; compare ST0 against the 16-bit integer memory operand

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2652
2653
2654
2655;
2656;---------------------- 32-bit signed integer operations ----------------------
2657;
2658
2659
2660;;
2661; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2662;
2663; @param A0 FPU context (fxsave).
2664; @param A1 Pointer to a IEMFPURESULT for the output.
2665; @param A2 Pointer to the 32-bit floating point value to convert.
2666;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe guest FSW bits
        fild    dword [A2]              ; load/convert the 32-bit signed integer

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2683
2684
2685;;
2686; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2687;
2688; @param A0 FPU context (fxsave).
2689; @param A1 Where to return the output FSW.
2690; @param A2 Where to store the 32-bit signed integer value.
2691; @param A3 Pointer to the 80-bit value.
2692;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; load the 80-bit source value into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding control!) + safe FSW bits
        fistp   dword [A2]              ; store (and pop) as 32-bit signed integer

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2708
2709
2710;;
2711; Store a 80-bit floating point value (register) as a 32-bit signed integer
2712; (memory) with truncation.
2713;
2714; @param A0 FPU context (fxsave).
2715; @param A1 Where to return the output FSW.
2716; @param A2 Where to store the 32-bit signed integer value.
2717; @param A3 Pointer to the 80-bit value.
2718;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; load the 80-bit source value into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        fisttp  dword [A2]              ; store (and pop) as 32-bit signed integer, always truncating

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2734
2735
2736;;
2737; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2738;
2739; @param 1 The instruction
2740;
2741; @param A0 FPU context (fxsave).
2742; @param A1 Pointer to a IEMFPURESULT for the output.
2743; @param A2 Pointer to the 80-bit value.
2744; @param A3 Pointer to the 32-bit value.
2745;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      dword [A3]              ; ST0 <op>= 32-bit integer memory operand

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2772
2773
2774;;
2775; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2776; only returning FSW.
2777;
2778; @param 1 The instruction
2779;
2780; @param A0 FPU context (fxsave).
2781; @param A1 Where to store the output FSW.
2782; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
2784;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      dword [A3]              ; compare ST0 against the 32-bit integer memory operand

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2804
2805
2806
2807;
2808;---------------------- 64-bit signed integer operations ----------------------
2809;
2810
2811
2812;;
2813; Converts a 64-bit floating point value to a 80-bit one (fpu register).
2814;
2815; @param A0 FPU context (fxsave).
2816; @param A1 Pointer to a IEMFPURESULT for the output.
2817; @param A2 Pointer to the 64-bit floating point value to convert.
2818;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe guest FSW bits
        fild    qword [A2]              ; load/convert the 64-bit signed integer

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2835
2836
2837;;
2838; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2839;
2840; @param A0 FPU context (fxsave).
2841; @param A1 Where to return the output FSW.
2842; @param A2 Where to store the 64-bit signed integer value.
2843; @param A3 Pointer to the 80-bit value.
2844;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; load the 80-bit source value into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding control!) + safe FSW bits
        fistp   qword [A2]              ; store (and pop) as 64-bit signed integer

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2860
2861
2862;;
2863; Store a 80-bit floating point value (register) as a 64-bit signed integer
2864; (memory) with truncation.
2865;
2866; @param A0 FPU context (fxsave).
2867; @param A1 Where to return the output FSW.
2868; @param A2 Where to store the 64-bit signed integer value.
2869; @param A3 Pointer to the 80-bit value.
2870;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; load the 80-bit source value into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        fisttp  qword [A2]              ; store (and pop) as 64-bit signed integer, always truncating

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2886
2887
2888
2889;
2890;---------------------- 32-bit floating point operations ----------------------
2891;
2892
2893;;
2894; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2895;
2896; @param A0 FPU context (fxsave).
2897; @param A1 Pointer to a IEMFPURESULT for the output.
2898; @param A2 Pointer to the 32-bit floating point value to convert.
2899;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe guest FSW bits
        fld     dword [A2]              ; load/convert the 32-bit floating point value

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2916
2917
2918;;
2919; Store a 80-bit floating point value (register) as a 32-bit one (memory).
2920;
2921; @param A0 FPU context (fxsave).
2922; @param A1 Where to return the output FSW.
2923; @param A2 Where to store the 32-bit value.
2924; @param A3 Pointer to the 80-bit value.
2925;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; load the 80-bit source value into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding/precision!) + safe FSW bits
        fst     dword [A2]              ; store as 32-bit floating point value

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2941
2942
2943;;
2944; FPU instruction working on one 80-bit and one 32-bit floating point value.
2945;
2946; @param 1 The instruction
2947;
2948; @param A0 FPU context (fxsave).
2949; @param A1 Pointer to a IEMFPURESULT for the output.
2950; @param A2 Pointer to the 80-bit value.
2951; @param A3 Pointer to the 32-bit value.
2952;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      dword [A3]              ; ST0 <op>= 32-bit floating point memory operand

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2979
2980
2981;;
2982; FPU instruction working on one 80-bit and one 32-bit floating point value,
2983; only returning FSW.
2984;
2985; @param 1 The instruction
2986;
2987; @param A0 FPU context (fxsave).
2988; @param A1 Where to store the output FSW.
2989; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
2991;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      dword [A3]              ; compare ST0 against the 32-bit floating point memory operand

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3011
3012
3013
3014;
3015;---------------------- 64-bit floating point operations ----------------------
3016;
3017
3018;;
3019; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3020;
3021; @param A0 FPU context (fxsave).
3022; @param A1 Pointer to a IEMFPURESULT for the output.
3023; @param A2 Pointer to the 64-bit floating point value to convert.
3024;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state (was missing; all sibling loaders do this first)
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe guest FSW bits
        fld     qword [A2]              ; load/convert the 64-bit floating point value

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3040
3041
3042;;
3043; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3044;
3045; @param A0 FPU context (fxsave).
3046; @param A1 Where to return the output FSW.
3047; @param A2 Where to store the 64-bit value.
3048; @param A3 Pointer to the 80-bit value.
3049;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; load the 80-bit source value into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding/precision!) + safe FSW bits
        fst     qword [A2]              ; store as 64-bit floating point value

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3065
3066
3067;;
3068; FPU instruction working on one 80-bit and one 64-bit floating point value.
3069;
3070; @param 1 The instruction
3071;
3072; @param A0 FPU context (fxsave).
3073; @param A1 Pointer to a IEMFPURESULT for the output.
3074; @param A2 Pointer to the 80-bit value.
3075; @param A3 Pointer to the 64-bit value.
3076;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      qword [A3]              ; ST0 <op>= 64-bit floating point memory operand

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3103
3104;;
3105; FPU instruction working on one 80-bit and one 64-bit floating point value,
3106; only returning FSW.
3107;
3108; @param 1 The instruction
3109;
3110; @param A0 FPU context (fxsave).
3111; @param A1 Where to store the output FSW.
3112; @param A2 Pointer to the 80-bit value.
3113; @param A3 Pointer to the 64-bit value.
3114;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      qword [A3]              ; compare ST0 against the 64-bit floating point memory operand

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3134
3135
3136
3137;
3138;---------------------- 80-bit floating point operations ----------------------
3139;
3140
3141;;
3142; Loads a 80-bit floating point register value from memory.
3143;
3144; @param A0 FPU context (fxsave).
3145; @param A1 Pointer to a IEMFPURESULT for the output.
3146; @param A2 Pointer to the 80-bit floating point value to load.
3147;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe guest FSW bits
        fld     tword [A2]              ; load the 80-bit value (no conversion, but may raise #IS etc.)

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3164
3165
3166;;
3167; Store a 80-bit floating point register to memory
3168;
3169; @param A0 FPU context (fxsave).
3170; @param A1 Where to return the output FSW.
3171; @param A2 Where to store the 80-bit value.
3172; @param A3 Pointer to the 80-bit register value.
3173;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; load the 80-bit register value into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        fstp    tword [A2]              ; store (and pop) the full 80-bit value

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3189
3190
3191;;
3192; Loads an 80-bit floating point register value in BCD format from memory.
3193;
3194; @param A0 FPU context (fxsave).
3195; @param A1 Pointer to a IEMFPURESULT for the output.
3196; @param A2 Pointer to the 80-bit BCD value to load.
3197;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe guest FSW bits
        fbld    tword [A2]              ; load/convert the 80-bit packed BCD value

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3214
3215
3216;;
3217; Store a 80-bit floating point register to memory as BCD
3218;
3219; @param A0 FPU context (fxsave).
3220; @param A1 Where to return the output FSW.
3221; @param A2 Where to store the 80-bit BCD value.
3222; @param A3 Pointer to the 80-bit register value.
3223;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; load the 80-bit register value into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        fbstp   tword [A2]              ; store (and pop) as 80-bit packed BCD

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3239
3240
3241;;
3242; FPU instruction working on two 80-bit floating point values.
3243;
3244; @param 1 The instruction
3245;
3246; @param A0 FPU context (fxsave).
3247; @param A1 Pointer to a IEMFPURESULT for the output.
3248; @param A2 Pointer to the first 80-bit value (ST0)
3249; @param A3 Pointer to the second 80-bit value (STn).
3250;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; second operand -> ST1 (after the next fld)
        fld     tword [A2]              ; first operand -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      %2                      ; %2 is the operand list, e.g. {st0, st1} or {} for implicit ops

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; result is taken from ST0

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3281
3282
3283;;
3284; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3285; storing the result in ST1 and popping the stack.
3286;
3287; @param 1 The instruction
3288;
3289; @param A0 FPU context (fxsave).
3290; @param A1 Pointer to a IEMFPURESULT for the output.
3291; @param A2 Pointer to the first 80-bit value (ST1).
3292; @param A3 Pointer to the second 80-bit value (ST0).
3293;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; first value -> ends up as ST1
        fld     tword [A3]              ; second value -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1                              ; computes into ST1 and pops; result is then in ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3318
3319
3320;;
3321; FPU instruction working on two 80-bit floating point values, only
3322; returning FSW.
3323;
3324; @param 1 The instruction
3325;
3326; @param A0 FPU context (fxsave).
3327; @param A1 Pointer to a uint16_t for the resulting FSW.
3328; @param A2 Pointer to the first 80-bit value.
3329; @param A3 Pointer to the second 80-bit value.
3330;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; second value -> ST1 (after the next fld)
        fld     tword [A2]              ; first value -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      st0, st1                ; compare; condition codes land in FSW

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3352
3353
3354;;
3355; FPU instruction working on two 80-bit floating point values,
3356; returning FSW and EFLAGS (eax).
3357;
3358; @param 1 The instruction
3359;
3360; @returns EFLAGS in EAX.
3361; @param A0 FPU context (fxsave).
3362; @param A1 Pointer to a uint16_t for the resulting FSW.
3363; @param A2 Pointer to the first 80-bit value.
3364; @param A3 Pointer to the second 80-bit value.
3365;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A3]              ; second value -> ST1 (after the next fld)
        fld     tword [A2]              ; first value -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1      st1                     ; fcomi/fucomi: compare ST0 with ST1, sets ZF/PF/CF

        fnstsw  word [A1]               ; return the status word ...
        pushf
        pop     xAX                     ; ... and EFLAGS in xAX

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3389
3390
3391;;
3392; FPU instruction working on one 80-bit floating point value.
3393;
3394; @param 1 The instruction
3395;
3396; @param A0 FPU context (fxsave).
3397; @param A1 Pointer to a IEMFPURESULT for the output.
3398; @param A2 Pointer to the 80-bit value.
3399;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; the operand -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits
        %1                              ; unary op on ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3427
3428
3429;;
3430; FPU instruction working on one 80-bit floating point value, only
3431; returning FSW.
3432;
3433; @param 1 The instruction
3434; @param 2 Non-zero to also restore FTW.
3435;
3436; @param A0 FPU context (fxsave).
3437; @param A1 Pointer to a uint16_t for the resulting FSW.
3438; @param A2 Pointer to the 80-bit value.
3439;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the loader macro

        fninit                          ; known, empty FPU state
        fld     tword [A2]              ; the operand -> ST0
%if %2 != 0
        ; Also restore the guest's FTW for ST0 so empty-register info is accurate (needed by fxam).
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1                              ; examine/test ST0; condition codes land in FSW

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3464
3465
3466
;;
; FPU instruction loading a 80-bit floating point constant.
;
; The constant is pushed with the guest FCW active (rounding/precision can
; affect fldl2t & friends), and the FSW + value are stored in the
; IEMFPURESULT structure.
;
; @param 1      The instruction
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; scratch area; presumably used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW - TODO confirm

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; load the guest FCW (+ a safe FSW)
        %1                              ; push the constant onto the FPU stack (st0)

        fnstsw  word  [A1 + IEMFPURESULT.FSW] ; capture FSW before popping the result
        fnclex                          ; clear pending exceptions so the fstp doesn't trigger them
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave with a clean FPU state
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3501
3502
;;
; FPU instruction working on one 80-bit floating point value, outputing two.
;
; @param 1      The instruction
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to a IEMFPURESULTTWO for the output.
; @param A2     Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area; presumably used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW - TODO confirm

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; st0 = the input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; load the guest FCW (+ a safe FSW)
        %1                              ; produces two results: st0 and st1

        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW] ; capture FSW before popping
        fnclex                          ; clear pending exceptions so the stores don't trigger them
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; top of stack -> second result...
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; ...then the value below it -> first result

        fninit                          ; leave with a clean FPU state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3537
3538
3539
3540
3541;---------------------- SSE and MMX Operations ----------------------
3542
; Placeholder prologue/epilogue pairs wrapped around the MMX/SSE/AVX worker
; bodies below.  Currently empty (see the @todo notes), but they give us a
; single place to add host SIMD state saving/restoring should it turn out
; to be needed.

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3560
3561
;;
; Media instruction working on two full sized registers.
;
; Emits iemAImpl_<insn>_u64 (MMX, optional) and iemAImpl_<insn>_u128 (SSE)
; workers that perform: *A1 = *A1 <insn> *A2.
;
; @param 1      The instruction
; @param 2      Whether there is an MMX variant (1) or not (0).
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to the first media register size operand (input/output).
; @param A2     Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; mm1 = destination operand value
        movq    mm0, [A2]               ; mm0 = source operand value
        %1      mm1, mm0                ; mm1 = mm1 <insn> mm0
        movq    [A1], mm1               ; write the result back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; xmm1 = destination operand value
        movdqu  xmm0, [A2]              ; xmm0 = source operand value
        %1      xmm1, xmm0              ; xmm1 = xmm1 <insn> xmm0
        movdqu  [A1], xmm1              ; write the result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb, 1
IEMIMPL_MEDIA_F2 pand, 1
IEMIMPL_MEDIA_F2 pandn, 1
IEMIMPL_MEDIA_F2 por, 1
IEMIMPL_MEDIA_F2 pxor, 1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb, 1
IEMIMPL_MEDIA_F2 paddw, 1
IEMIMPL_MEDIA_F2 paddd, 1
IEMIMPL_MEDIA_F2 paddq, 1
IEMIMPL_MEDIA_F2 psubb, 1
IEMIMPL_MEDIA_F2 psubw, 1
IEMIMPL_MEDIA_F2 psubd, 1
IEMIMPL_MEDIA_F2 psubq, 1
3623
3624
;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; Since the punpckl* instructions only consume the low half of the source,
; the MMX variant does a 32-bit (movd) load and the SSE variant a 64-bit
; (movq) load of the second operand.
;
; @param 1      The instruction
; @param 2      1 if MMX is included, 0 if not.
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to the first full sized media register operand (input/output).
; @param A2     Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; load the full destination operand
        movd    mm1, [A2]               ; low 32 bits of the source suffice here
        %1      mm0, mm1
        movq    [A1], mm0               ; write the result back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; load the full destination operand
        movq    xmm1, [A2]              ; low 64 bits of the source suffice here
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; write the result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3669
3670
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; Counterpart to IEMIMPL_MEDIA_F1L1: instructions like punpckh* consume the
; HIGH half of the source, so the second operand must be loaded full-width
; (movq / movdqu) - a half-sized load would leave the consumed half zero.
;
; @param 1      The instruction
; @param 2      1 if MMX is included, 0 if not.
;
; @param A0     FPU context (fxsave).
; @param A1     Pointer to the first full sized media register operand (input/output).
; @param A2     Pointer to the second full sized media register operand, where we
;               will only use the upper half (input).
;
%macro IEMIMPL_MEDIA_F1H1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; load the full destination operand
        movq    mm1, [A2]               ; full load - the instruction uses the high half
        %1      mm0, mm1
        movq    [A1], mm0               ; write the result back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; load the full destination operand
        movdqu  xmm1, [A2]              ; full load - the instruction uses the high half
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; write the result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3711
; These must use the high-half macro: IEMIMPL_MEDIA_F1L1 only loads 32 bits
; (MMX, movd) resp. 64 bits (SSE, movq) of the source, but punpckh* reads the
; HIGH half of the source operand, which would then be all zeroes.
IEMIMPL_MEDIA_F1H1 punpckhbw, 1
IEMIMPL_MEDIA_F1H1 punpckhwd, 1
IEMIMPL_MEDIA_F1H1 punpckhdq, 1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3716
3717
3718;
3719; Shufflers with evil 8-bit immediates.
3720;
3721
;;
; pshufw with its evil 8-bit immediate: dispatches into a 256-entry table of
; 5-byte "pshufw mm0, mm1, imm8; ret" stubs indexed by the immediate.
;
; @param A0     Pointer to the output qword.
; @param A1     Pointer to the input qword.
; @param A2     The 8-bit immediate (0..255).
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; load the source operand
        movq    mm0, mm1                ; paranoia! (pre-seed the result; 'movq mm0, mm0' was a no-op)
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the immediate stub table
        lea     T1, [T1 + T0]           ; T1 = stub for immediate A2
        call    T1
        movq    [A0], mm0               ; store the shuffled result

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64
3747
3748
;;
; SSE shuffle with an 8-bit immediate: dispatches into a 256-entry table of
; 6-byte "<insn> xmm0, xmm1, imm8; ret" stubs indexed by the immediate.
;
; @param 1      The instruction (pshufhw, pshuflw or pshufd).
;
; @param A0     Pointer to the output dqword.
; @param A1     Pointer to the input dqword.
; @param A2     The 8-bit immediate (0..255).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; load the source operand
        movdqu  xmm0, xmm1              ; paranoia! pre-seed the result register
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the immediate stub table
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = stub for immediate A2
        call    T1
        movdqu  [A0], xmm0              ; store the shuffled result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3780
3781
;;
; AVX 256-bit shuffle with an 8-bit immediate: dispatches into a 256-entry
; table of 6-byte "<insn> ymm0, ymm1, imm8; ret" stubs indexed by the
; immediate.
;
; @param 1      The instruction (vpshufhw, vpshuflw or vpshufd).
;
; @param A0     Pointer to the output qqword.
; @param A1     Pointer to the input qqword.
; @param A2     The 8-bit immediate (0..255).
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX worker - was using the SSE prologue by mistake

        vmovdqu ymm1, [A1]              ; load the source operand
        vmovdqu ymm0, ymm1              ; paranoia! pre-seed the result register
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the immediate stub table
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = stub for immediate A2
        call    T1
        vmovdqu [A0], ymm0              ; store the shuffled result

        IEMIMPL_AVX_EPILOGUE            ; matching AVX epilogue
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3813
3814
3815;
3816; Move byte mask.
3817;
3818
;;
; pmovmskb on an MMX register: collects the byte sign bits into a GPR-sized
; mask and stores it as a 64-bit value at *A0.
;
; @param A0     Pointer to the 64-bit destination.
; @param A1     Pointer to the 64-bit source value.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; fetch the source value
        pmovmskb T0, mm0                ; T0 = mask of byte sign bits
        mov     [A0], T0                ; store the mask
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: explicitly zero the high dword of the u64 result
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
3832
;;
; pmovmskb on an XMM register: collects the 16 byte sign bits into a
; GPR-sized mask and stores it as a 64-bit value at *A0.
;
; @param A0     Pointer to the 64-bit destination.
; @param A1     Pointer to the 128-bit source value.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; fetch the source value
        pmovmskb T0, xmm0               ; T0 = mask of byte sign bits
        mov     [A0], T0                ; store the mask
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: explicitly zero the high dword of the u64 result
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
3846
;;
; vpmovmskb on a YMM register: collects the 32 byte sign bits into a
; GPR-sized mask and stores it as a 64-bit value at *A0.
;
; @param A0     Pointer to the 64-bit destination.
; @param A1     Pointer to the 256-bit source value.
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; fetch the source value
        vpmovmskb T0, ymm0              ; T0 = mask of byte sign bits
        mov     [A0], T0                ; store the mask
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: explicitly zero the high dword of the u64 result
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
3860
3861
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256 workers performing
; *A1 = *A2 <insn> *A3.  (A0 is not currently referenced by the generated
; code.)
;
; @param 1      The instruction
;
; @param A0     Pointer to the extended CPU/FPU state (X86XSAVEAREA).
; @param A1     Pointer to the destination media register size operand (output).
; @param A2     Pointer to the first source media register size operand (input).
; @param A3     Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]              ; first source operand
        vmovdqu xmm1, [A3]              ; second source operand
        %1      xmm0, xmm0, xmm1        ; xmm0 = xmm0 <insn> xmm1
        vmovdqu [A1], xmm0              ; store the result

        IEMIMPL_AVX_EPILOGUE            ; was erroneously IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE            ; was erroneously IEMIMPL_SSE_PROLOGUE

        vmovdqu ymm0, [A2]              ; first source operand
        vmovdqu ymm1, [A3]              ; second source operand
        %1      ymm0, ymm0, ymm1        ; ymm0 = ymm0 <insn> ymm1
        vmovdqu [A1], ymm0              ; store the result

        IEMIMPL_AVX_EPILOGUE            ; was erroneously IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
3921
3922
3923;
3924; The SSE 4.2 crc32
3925;
3926; @param 1 The instruction
3927;
; @param A0            Pointer to the 32-bit CRC accumulator (input/output).
; @param A1            The source operand, sized according to the suffix.
3930;
3931
;;
; Accumulates one byte into the CRC-32C (Castagnoli) checksum at *A0.
;
; @param A0     Pointer to the 32-bit CRC accumulator (input/output).
; @param A1     The 8-bit source value (low byte of the register).
;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; fetch the current CRC value
        crc32   T0_32, A1_8             ; fold in the source byte
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
3941
;;
; Accumulates one word into the CRC-32C (Castagnoli) checksum at *A0.
;
; @param A0     Pointer to the 32-bit CRC accumulator (input/output).
; @param A1     The 16-bit source value (low word of the register).
;
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; fetch the current CRC value
        crc32   T0_32, A1_16            ; fold in the source word
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
3951
;;
; Accumulates one dword into the CRC-32C (Castagnoli) checksum at *A0.
;
; @param A0     Pointer to the 32-bit CRC accumulator (input/output).
; @param A1     The 32-bit source value.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; fetch the current CRC value
        crc32   T0_32, A1_32            ; fold in the source dword
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
3961
%ifdef RT_ARCH_AMD64
;;
; Accumulates one qword into the CRC-32C (Castagnoli) checksum at *A0.
; AMD64 hosts only (the 64-bit crc32 form doesn't exist on x86).
;
; @param A0     Pointer to the 32-bit CRC accumulator (input/output).
; @param A1     The 64-bit source value.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; fetch the current CRC (zero-extends into T0)
        crc32   T0, A1                  ; fold in the source qword; result is 32 bits
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
3973
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette