VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 96945

Last change on this file since 96945 was 96945, checked in by vboxsync, 2 years ago

IEM: Assembly implementation of AES-NI instructions, WIP.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 174.0 KB
Line 
1; $Id: IEMAllAImpl.asm 96945 2022-09-30 06:51:12Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2022 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; RET XX / RET wrapper for fastcall.
;
; The byte count is only emitted for 32-bit x86 Windows builds; every other
; configuration gets a plain RET and the parameter is ignored.
;
; @param        1       Number of bytes to pop off the stack on return.
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret
 %else
  %ifndef RT_OS_WINDOWS
        ret
  %else
        ret     %1
  %endif
 %endif
%endmacro
55
;;
; NAME for fastcall functions.
;
; Expands to NAME(a_Name), except on 32-bit x86 Windows where the decorated
; form <a_Prefix><a_Name>@<a_cbArgs> is produced.
;
; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;       escaping (or whatever the dollar is good for here).  Thus the ugly
;       prefix argument.
;
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %else
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
 %endif
%else
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%endif
70
;;
; BEGINPROC for fastcall functions.
;
; Emits the export/global directives appropriate for the object format and
; then the function label itself.  The directives use the '$@' prefix while
; the label uses plain '@' (see the yasm note on NAME_FASTCALL).
;
; @param        1       The function name (C).
; @param        2       The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
  export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
   export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
  global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
91
92
93;
; We employ some macro assembly here to hide the calling convention differences.
95;
96%ifdef RT_ARCH_AMD64
97 %macro PROLOGUE_1_ARGS 0
98 %endmacro
99 %macro EPILOGUE_1_ARGS 0
100 ret
101 %endmacro
102 %macro EPILOGUE_1_ARGS_EX 0
103 ret
104 %endmacro
105
106 %macro PROLOGUE_2_ARGS 0
107 %endmacro
108 %macro EPILOGUE_2_ARGS 0
109 ret
110 %endmacro
111 %macro EPILOGUE_2_ARGS_EX 1
112 ret
113 %endmacro
114
115 %macro PROLOGUE_3_ARGS 0
116 %endmacro
117 %macro EPILOGUE_3_ARGS 0
118 ret
119 %endmacro
120 %macro EPILOGUE_3_ARGS_EX 1
121 ret
122 %endmacro
123
124 %macro PROLOGUE_4_ARGS 0
125 %endmacro
126 %macro EPILOGUE_4_ARGS 0
127 ret
128 %endmacro
129 %macro EPILOGUE_4_ARGS_EX 1
130 ret
131 %endmacro
132
133 %ifdef ASM_CALL64_GCC
134 %define A0 rdi
135 %define A0_32 edi
136 %define A0_16 di
137 %define A0_8 dil
138
139 %define A1 rsi
140 %define A1_32 esi
141 %define A1_16 si
142 %define A1_8 sil
143
144 %define A2 rdx
145 %define A2_32 edx
146 %define A2_16 dx
147 %define A2_8 dl
148
149 %define A3 rcx
150 %define A3_32 ecx
151 %define A3_16 cx
152 %endif
153
154 %ifdef ASM_CALL64_MSC
155 %define A0 rcx
156 %define A0_32 ecx
157 %define A0_16 cx
158 %define A0_8 cl
159
160 %define A1 rdx
161 %define A1_32 edx
162 %define A1_16 dx
163 %define A1_8 dl
164
165 %define A2 r8
166 %define A2_32 r8d
167 %define A2_16 r8w
168 %define A2_8 r8b
169
170 %define A3 r9
171 %define A3_32 r9d
172 %define A3_16 r9w
173 %endif
174
175 %define T0 rax
176 %define T0_32 eax
177 %define T0_16 ax
178 %define T0_8 al
179
180 %define T1 r11
181 %define T1_32 r11d
182 %define T1_16 r11w
183 %define T1_8 r11b
184
185 %define T2 r10 ; only AMD64
186 %define T2_32 r10d
187 %define T2_16 r10w
188 %define T2_8 r10b
189
190%else
191 ; x86
192 %macro PROLOGUE_1_ARGS 0
193 push edi
194 %endmacro
195 %macro EPILOGUE_1_ARGS 0
196 pop edi
197 ret 0
198 %endmacro
199 %macro EPILOGUE_1_ARGS_EX 1
200 pop edi
201 ret %1
202 %endmacro
203
204 %macro PROLOGUE_2_ARGS 0
205 push edi
206 %endmacro
207 %macro EPILOGUE_2_ARGS 0
208 pop edi
209 ret 0
210 %endmacro
211 %macro EPILOGUE_2_ARGS_EX 1
212 pop edi
213 ret %1
214 %endmacro
215
216 %macro PROLOGUE_3_ARGS 0
217 push ebx
218 mov ebx, [esp + 4 + 4]
219 push edi
220 %endmacro
221 %macro EPILOGUE_3_ARGS_EX 1
222 %if (%1) < 4
223 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
224 %endif
225 pop edi
226 pop ebx
227 ret %1
228 %endmacro
229 %macro EPILOGUE_3_ARGS 0
230 EPILOGUE_3_ARGS_EX 4
231 %endmacro
232
233 %macro PROLOGUE_4_ARGS 0
234 push ebx
235 push edi
236 push esi
237 mov ebx, [esp + 12 + 4 + 0]
238 mov esi, [esp + 12 + 4 + 4]
239 %endmacro
240 %macro EPILOGUE_4_ARGS_EX 1
241 %if (%1) < 8
242 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
243 %endif
244 pop esi
245 pop edi
246 pop ebx
247 ret %1
248 %endmacro
249 %macro EPILOGUE_4_ARGS 0
250 EPILOGUE_4_ARGS_EX 8
251 %endmacro
252
253 %define A0 ecx
254 %define A0_32 ecx
255 %define A0_16 cx
256 %define A0_8 cl
257
258 %define A1 edx
259 %define A1_32 edx
260 %define A1_16 dx
261 %define A1_8 dl
262
263 %define A2 ebx
264 %define A2_32 ebx
265 %define A2_16 bx
266 %define A2_8 bl
267
268 %define A3 esi
269 %define A3_32 esi
270 %define A3_16 si
271
272 %define T0 eax
273 %define T0_32 eax
274 %define T0_16 ax
275 %define T0_8 al
276
277 %define T1 edi
278 %define T1_32 edi
279 %define T1_16 di
280%endif
281
282
;;
; Merges the modified and undefined guest flags from [%1] into the host EFLAGS.
;
; The '%if (%3) != 0' guard is commented out, so the load is currently done
; unconditionally.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        mov     T0_32, [%1]                     ; fetch the guest flags (MOV does not touch EFLAGS).
        pushf                                   ; put the current host flags on the stack.
        and     T0_32, (%2 | %3)                ; guest side: keep only the modified and undefined flags.
        and     dword [xSP], ~(%2 | %3)         ; host side: drop exactly those flags.
        or      [xSP], T0                       ; merge the two.
        popf                                    ; make the mixed flags current.
 ;%endif
%endmacro
302
;;
; Writes the modified and undefined flags from the host EFLAGS back to [%1].
;
; Expands to nothing when both masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                              ; T1 = host flags as left by the instruction.
        and     T1_32, (%2 | %3)                ; keep the modified and undefined flags.
        mov     T0_32, [%1]                     ; T0 = previous guest flags.
        and     T0_32, ~(%2 | %3)               ; make room for the new values.
        or      T0_32, T1_32                    ; merge.
        mov     [%1], T0_32                     ; store the updated guest flags.
 %endif
%endmacro
322
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; The %2 flags are taken from the host EFLAGS, the %3 flags are cleared and
; the %4 flags are set in the value at [%1].  Expands to nothing when all
; three masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                              ; T1 = host flags as left by the instruction.
        and     T1_32, (%2)                     ; keep the modified flags only.
        mov     T0_32, [%1]                     ; T0 = previous guest flags.
        and     T0_32, ~(%2 | %3)               ; drop the modified and the always cleared flags.
        or      T0_32, T1_32                    ; merge.
  %if (%4) != 0
        or      T0_32, %4                       ; force the always set flags.
  %endif
        mov     [%1], T0_32                     ; store the result.
 %endif
%endmacro
346
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0.  So T1 accumulates the EFLAGS value, and the host flags are fetched
; via T2 on AMD64, or via T0 (saved and restored around it) on x86.
;
; @remarks      Clobbers T1, stack, %6, EFLAGS, and on AMD64 also T2.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index.  Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                              ; T2 = host flags.
        mov     T1_32, [%1]                     ; T1 = guest flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; drop modified, always cleared and the two calculated flags.
        and     T2_32, (%2)                     ; keep the modified flags.
        or      T1_32, T2_32                    ; merge.
 %else
        push    T0                              ; T0 holds the result; borrow it for the host flags.
        pushf
        pop     T0
        mov     T1_32, [%1]                     ; T1 = guest flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; drop modified, always cleared and the two calculated flags.
        and     T0_32, (%2)                     ; keep the modified flags.
        or      T1_32, T0_32                    ; merge.
        pop     T0
 %endif

        ; SF goes first since %4 is likely to be the same register as %6, which is trashed below.
        bt      %4, %5 - 1                      ; CF = top bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
%%sf_clear:

        ; PF last: table lookup on the low byte of %6.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32                     ; store the result.
%endmacro
400
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; The host EFLAGS are not consulted.  Expands to nothing when both masks
; are zero.
;
; @remarks      Clobbers T0.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; T0 = guest flags.
  %if (%2) != 0
        and     T0_32, ~(%2)                    ; drop the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3                       ; force the always set flags.
  %endif
        mov     [%1], T0_32                     ; store the result.
 %endif
%endmacro
421
;;
; Calculates the new EFLAGS using fixed clear and set bit masks, plus PF
; looked up in g_afParity by the low byte of %4.
;
; @remarks      Clobbers T0, %4, EFLAGS, and on AMD64 also T2.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index.  Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]                     ; T0 = guest flags.
        and     T0_32, ~(%2 | X86_EFL_PF)       ; drop PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3                       ; force the always set flags.
 %endif
        and     %4, 0xff                        ; table index = low byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                     ; store the result.
%endmacro
446
447
448;*********************************************************************************************************************************
449;* External Symbols *
450;*********************************************************************************************************************************
451extern NAME(g_afParity)
452
453
;;
; Macro for implementing a binary operator.
;
; Generates iemAImpl_<instr>_u8/u16/u32 (plus _u64 on AMD64) and, when
; requested, the corresponding _locked variants.  On 32-bit systems the
; 64-bit accesses require hand coding and are not generated here.
;
; Each function gets a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to the eflags in A2.  It loads
; the guest flags, executes the instruction directly on [A0] and stores the
; resulting flags back.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr,lock, modified-flags,                                                               undefined flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
553
554
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; Generates iemAImpl_<instr>_u32 and, on AMD64, iemAImpl_<instr>_u64.  On
; 32-bit systems the 64-bit accesses require hand coding.
;
; Each function gets a pointer to the destination operand in A0, the first
; source register operand in A1, the second source register operand in A2
; and a pointer to the eflags in A3.  The result is produced in T0 and then
; stored to [A0].
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                 instr,  modified-flags,                                         undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
595
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; Generates iemAImpl_<instr>_u32 and, on AMD64, iemAImpl_<instr>_u64.  On
; 32-bit systems the 64-bit accesses require hand coding.
;
; Each function gets a pointer to the destination operand in A0, the source
; register operand in A1 and a pointer to the eflags in A2.  The destination
; is loaded into T0, operated on and written back.
;
; These are three-argument functions (cbArgs = 12), so the three-argument
; prologue/epilogue is used.  The four-argument pair assembles to the same
; code on AMD64, but on 32-bit x86 it would fetch a fourth argument that
; does not exist and return with 'ret 8' instead of 'ret 4'.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                   instr,   modified-flags,                                       undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
637
638
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; Generates iemAImpl_<instr>_u32 and, on AMD64, iemAImpl_<instr>_u64, plus
; optional _fallback variants built on a legacy shift instruction with the
; count in CL.  On 32-bit systems the 64-bit accesses require hand coding.
;
; Each function gets a pointer to the destination operand in A0, the first
; source register operand in A1 and the second source register operand (the
; shift count for the fallbacks) in A2.  No eflags pointer is passed.
;
; Fallback register shuffling:
;   - GCC convention:  A2 is xDX, so the count is simply copied into CL.
;   - Otherwise:       A0 is xCX, so A0 and A2 are exchanged; afterwards CL
;                      holds the count and A2 holds the destination pointer.
;                      A0 must not be used as a pointer after that.
;
; @param        1       The instruction mnemonic.
; @param        2       Fallback instruction if applicable.
; @param        3       Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; full 64-bit result.
   %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        %2      A1, cl
        mov     [A2], A1                ; full 64-bit result; A0 is the count now, so never store via [A0] here.
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                       instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop,            0
710
711
;
; RORX uses a immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
; A0 points to the destination, A1 holds the value to rotate and A2 the
; rotate count.  ROR needs the count in CL:
;   - GCC convention:  A2 is xDX, so the count is copied into CL.
;   - Otherwise:       A0 is xCX, so A0 and A2 are exchanged; afterwards CL
;                      holds the count and A2 the destination pointer.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1                ; full 64-bit result.
  %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        ror     A1, cl
        mov     [A2], A1                ; full 64-bit result; A0 is the count now, so never store via [A0] here.
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
746
747
;
; MULX
;
; A0 points to the destination for the high half and A1 to the destination
; for the low half; A2 is uSrc1 and A3 the other multiplicand.  MULX takes
; one factor implicitly from EDX, so uSrc1 has to be there.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is already EDX, nothing to shuffle.
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32     ; Low value first, as we should return the high part if same destination registers.
%else
        ; A1 is xDX - swap A1 and A2 so that EDX=uSrc1 and A2 is the low destination pointer.
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32     ; Low value first, as we should return the high part if same destination registers.
%endif
        mov     [A0], T0_32     ; The high value goes last in both variants.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32
767
768
;
; MULX fallback using the legacy MUL instruction (EDX:EAX = EAX * operand).
;
; A0 points to the destination for the high half and A1 to the destination
; for the low half; A2 is uSrc1 and A3 the other multiplicand.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A1_32           ; After the exchange uSrc1 is in A1 (EDX); A2 is the low destination pointer.
        mov     [A2], eax       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
787
%ifdef RT_ARCH_AMD64
;
; 64-bit MULX and its MUL based fallback.
;
; A0 points to the destination for the high half and A1 to the destination
; for the low half; A2 is uSrc1 and A3 the other multiplicand.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1        ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1        ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A1              ; After the exchange uSrc1 is in A1 (RDX); A2 is the low destination pointer.
        mov     [A2], rax       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
827
828
;;
; Macro for implementing a bit operator.
;
; Generates iemAImpl_<instr>_u16/u32 (plus _u64 on AMD64) and, when
; requested, the corresponding _locked variants.  On 32-bit systems the
; 64-bit accesses require hand coding.
;
; Each function gets a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to the eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr, lock, modified-flags, undefined-flags
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
905
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leaves it as is.  The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; Three flavours are generated per operand size:
;   - native:  guest flags are loaded, the instruction executed and the
;              modified + undefined flags saved from the host EFLAGS.
;   - _intel:  the flags are computed rather than copied from the host:
;              OF, SF, AF, CF and ZF are cleared and PF is looked up from
;              the result when the destination is written; otherwise ZF and
;              PF are set and OF, SF, AF and CF cleared.  The host EFLAGS
;              are never saved, so no guest flags are loaded beforehand.
;   - _amd:    only the modified flags (%2) are saved from the host EFLAGS.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
  %if %4 != 0
        jz      .unchanged_dst
  %endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        ; No IEM_MAYBE_LOAD_FLAGS here, same as the 16 and 32-bit intel
        ; variants: the resulting flags are computed below and the host
        ; EFLAGS are never saved.
        %1      T1, A1
  %if %4 != 0
        jz      .unchanged_dst
  %endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
  %if %4 != 0
        jz      .unchanged_dst
  %endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1057
1058
;;
; Macro for implementing POPCNT.
;
; Generates iemAImpl_<instr>_u16/u32 and, on AMD64, iemAImpl_<instr>_u64.
; On 32-bit systems the 64-bit accesses require hand coding.
;
; Each function gets a pointer to the destination operand in A0, the source
; register operand in A1 and a pointer to the eflags in A2.  The result is
; produced in T0 and then stored to [A0].
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
        mov     [A0], T0_16
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;              instr,  modified-flags,                                                                     undefined-flags
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1108
1109
;
; Two operand IMUL - similar to the binary operators but different enough to
; get its own macro (no lock prefix, no memory destination).
; The rDX:rAX variant of imul is handled together with mul further down.
;
; Register interface of every generated worker:
;       A0 - pointer to the destination operand (first factor, receives the product).
;       A1 - the second factor.
;       A2 - pointer to the eflags variable.
;
BEGINCODE
; @param        1       EFLAGS that are modified.
; @param        2       Undefined EFLAGS.
; @param        3       Function suffix.
; @param        4       EFLAGS variation selector.  As coded below, the value 1
;                       (used by the _intel instantiation) selects
;                       IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF, while every other
;                       value (0 = native, 2 = _amd) selects plain IEM_SAVE_FLAGS.
;                       NOTE(review): an earlier comment described 1 as "ignored"
;                       and 2 as the special AMD case - confirm which is intended.
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %1, %2
        imul    A1_16, word [A0]                ; A1 = A1 * *pDst
        mov     [A0], A1_16
 %if %4 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %else
        IEM_SAVE_FLAGS                          A2, %1, %2
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %1, %2
        imul    A1_32, dword [A0]               ; A1 = A1 * *pDst
        mov     [A0], A1_32
 %if %4 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %else
        IEM_SAVE_FLAGS                          A2, %1, %2
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %1, %2
        imul    A1, qword [A0]                  ; A1 = A1 * *pDst
        mov     [A0], A1
  %if %4 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
  %else
        IEM_SAVE_FLAGS                          A2, %1, %2
  %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1165
1166
1167;
1168; XCHG for memory operands. This implies locking. No flag changes.
1169;
1170; Each function takes two arguments, first the pointer to the memory,
1171; then the pointer to the register. They all return void.
1172;
1173BEGINCODE
;; 8-bit XCHG: A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = current register value
        xchg    T0_8, [A0]              ; swap with memory; T0 = previous memory value
        mov     [A1], T0_8              ; hand the previous memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked
1181
;; 16-bit XCHG: A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]             ; T0 = current register value
        xchg    T0_16, [A0]             ; swap with memory; T0 = previous memory value
        mov     [A1], T0_16             ; hand the previous memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked
1189
;; 32-bit XCHG: A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]             ; T0 = current register value
        xchg    T0_32, [A0]             ; swap with memory; T0 = previous memory value
        mov     [A1], T0_32             ; hand the previous memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked
1197
%ifdef RT_ARCH_AMD64
;; 64-bit XCHG (64-bit hosts only): A0 = pointer to the memory operand,
;; A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                ; T0 = current register value
        xchg    T0, [A0]                ; swap with memory; T0 = previous memory value
        mov     [A1], T0                ; hand the previous memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1207
1208; Unlocked variants for fDisregardLock mode.
1209
;; 8-bit exchange done with plain loads and stores (no XCHG instruction is used).
;; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value
        mov     T1_8, [A0]              ; T1 = memory value
        mov     [A0], T0_8              ; memory <- register value
        mov     [A1], T1_8              ; register <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1218
;; 16-bit exchange done with plain loads and stores (no XCHG instruction is used).
;; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]             ; T0 = register value
        mov     T1_16, [A0]             ; T1 = memory value
        mov     [A0], T0_16             ; memory <- register value
        mov     [A1], T1_16             ; register <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1227
;; 32-bit exchange done with plain loads and stores (no XCHG instruction is used).
;; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]             ; T0 = register value
        mov     T1_32, [A0]             ; T1 = memory value
        mov     [A0], T0_32             ; memory <- register value
        mov     [A1], T1_32             ; register <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1236
%ifdef RT_ARCH_AMD64
;; 64-bit exchange done with plain loads and stores (64-bit hosts only).
;; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                ; T0 = register value
        mov     T1, [A0]                ; T1 = memory value
        mov     [A0], T0                ; memory <- register value
        mov     [A1], T1                ; register <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1247
1248
1249;
1250; XADD for memory operands.
1251;
1252; Each function takes three arguments, first the pointer to the
1253; memory/register, then the pointer to the register, and finally a pointer to
1254; eflags. They all return void.
1255;
1256BEGINCODE
;; 8-bit XADD: A0 = pointer to the destination, A1 = pointer to the register
;; value, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = addend from the register operand
        xadd    [A0], T0_8              ; *pDst += T0; T0 = previous *pDst
        mov     [A1], T0_8              ; register operand receives the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8
1266
;; 16-bit XADD: A0 = pointer to the destination, A1 = pointer to the register
;; value, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; T0 = addend from the register operand
        xadd    [A0], T0_16             ; *pDst += T0; T0 = previous *pDst
        mov     [A1], T0_16             ; register operand receives the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16
1276
;; 32-bit XADD: A0 = pointer to the destination, A1 = pointer to the register
;; value, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; T0 = addend from the register operand
        xadd    [A0], T0_32             ; *pDst += T0; T0 = previous *pDst
        mov     [A1], T0_32             ; register operand receives the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32
1286
%ifdef RT_ARCH_AMD64
;; 64-bit XADD (64-bit hosts only): A0 = pointer to the destination,
;; A1 = pointer to the register value, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; T0 = addend from the register operand
        xadd    [A0], T0                ; *pDst += T0; T0 = previous *pDst
        mov     [A1], T0                ; register operand receives the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1298
;; 8-bit LOCK XADD: A0 = pointer to the destination, A1 = pointer to the
;; register value, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = addend from the register operand
        lock xadd [A0], T0_8            ; atomically: *pDst += T0; T0 = previous *pDst
        mov     [A1], T0_8              ; register operand receives the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked
1308
;; 16-bit LOCK XADD: A0 = pointer to the destination, A1 = pointer to the
;; register value, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; T0 = addend from the register operand
        lock xadd [A0], T0_16           ; atomically: *pDst += T0; T0 = previous *pDst
        mov     [A1], T0_16             ; register operand receives the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked
1318
;; 32-bit LOCK XADD: A0 = pointer to the destination, A1 = pointer to the
;; register value, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; T0 = addend from the register operand
        lock xadd [A0], T0_32           ; atomically: *pDst += T0; T0 = previous *pDst
        mov     [A1], T0_32             ; register operand receives the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked
1328
%ifdef RT_ARCH_AMD64
;; 64-bit LOCK XADD (64-bit hosts only): A0 = pointer to the destination,
;; A1 = pointer to the register value, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; T0 = addend from the register operand
        lock xadd [A0], T0              ; atomically: *pDst += T0; T0 = previous *pDst
        mov     [A1], T0                ; register operand receives the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1340
1341
1342;
1343; CMPXCHG8B.
1344;
1345; These are tricky register wise, so the code is duplicated for each calling
1346; convention.
1347;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1349;
1350; C-proto:
1351; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1352; uint32_t *pEFlags));
1353;
1354; Note! Identical to iemAImpl_cmpxchg16b.
1355;
1356BEGINCODE
;;
; CMPXCHG8B worker; the instruction is always issued with a LOCK prefix.
;
; Arguments in calling-convention order: pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags.
; The implicit EDX:EAX / ECX:EBX operands force a hand-written register
; shuffle per calling convention, so no PROLOGUE/EPILOGUE macros are used.
; Only ZF is transferred to and from *pEFlags.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; Incoming: rcx = pu64Dst, rdx = pu64EaxEdx, r8 = pu64EbxEcx, r9 = pEFlags.
        push    rbx                     ; rbx is needed for the EBX operand and is saved/restored here

        mov     r11, rdx                ; r11 = pu64EaxEdx (r11 is also T1); frees rdx
        mov     r10, rcx                ; r10 = pu64Dst; frees rcx

        mov     ebx, [r8]               ; ecx:ebx = replacement value
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax), so eax is loaded afterwards
        mov     eax, [r11]              ; edx:eax = expected value
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write edx:eax back to *pu64EaxEdx
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; Incoming: rdi = pu64Dst, rsi = pu64EaxEdx, rdx = pu64EbxEcx, rcx = pEFlags.
        push    rbx                     ; rbx is needed for the EBX operand and is saved/restored here

        mov     r10, rcx                ; r10 = pEFlags; frees rcx
        mov     r11, rdx                ; r11 = pu64EbxEcx (r11 is also T1); frees rdx

        mov     ebx, [r11]              ; ecx:ebx = replacement value
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax), so eax is loaded afterwards
        mov     eax, [rsi]              ; edx:eax = expected value
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; write edx:eax back to *pu64EaxEdx
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; Incoming: ecx = pu64Dst, edx = pu64EaxEdx, stack = pu64EbxEcx, pEFlags.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; edi = pu64Dst
        mov     esi, edx                ; esi = pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = the four pushes, 4 = return address)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = replacement value (ecx is loaded last as it is the pointer)
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax), so eax is loaded afterwards
        mov     eax, [esi]              ; edx:eax = expected value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; write edx:eax back to *pu64EaxEdx
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pops the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1431
;; Locked CMPXCHG8B entry point.  The plain worker always applies the LOCK
;; prefix, so this is just a tail jump to it.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1436
1437%ifdef RT_ARCH_AMD64
1438
1439;
1440; CMPXCHG16B.
1441;
1442; These are tricky register wise, so the code is duplicated for each calling
1443; convention.
1444;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1446;
1447; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1449; uint32_t *pEFlags));
1450;
1451; Note! Identical to iemAImpl_cmpxchg8b.
1452;
1453BEGINCODE
;;
; CMPXCHG16B worker; the instruction is always issued with a LOCK prefix.
;
; Arguments in calling-convention order: pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags.
; Same register shuffle as iemAImpl_cmpxchg8b, with 64-bit halves.
; Only ZF is transferred to and from *pEFlags.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        ; Incoming: rcx = pu128Dst, rdx = pu128RaxRdx, r8 = pu128RbxRcx, r9 = pEFlags.
        push    rbx                     ; rbx is needed for the RBX operand and is saved/restored here

        mov     r11, rdx                ; r11 = pu128RaxRdx (r11 is also T1); frees rdx
        mov     r10, rcx                ; r10 = pu128Dst; frees rcx

        mov     rbx, [r8]               ; rcx:rbx = replacement value
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax), so rax is loaded afterwards
        mov     rax, [r11]              ; rdx:rax = expected value
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write rdx:rax back to *pu128RaxRdx
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; Incoming: rdi = pu128Dst, rsi = pu128RaxRdx, rdx = pu128RbxRcx, rcx = pEFlags.
        push    rbx                     ; rbx is needed for the RBX operand and is saved/restored here

        mov     r10, rcx                ; r10 = pEFlags; frees rcx
        mov     r11, rdx                ; r11 = pu128RbxRcx (r11 is also T1); frees rdx

        mov     rbx, [r11]              ; rcx:rbx = replacement value
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax), so rax is loaded afterwards
        mov     rax, [rsi]              ; rdx:rax = expected value
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; write rdx:rax back to *pu128RaxRdx
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1498
;; Locked CMPXCHG16B entry point.  The plain worker always applies the LOCK
;; prefix, so this is just a tail jump to it.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
1503
1504%endif ; RT_ARCH_AMD64
1505
1506
;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
; Register interface of the generated workers:
;       A0 - pointer to the destination operand.
;       A1 - pointer to the accumulator value (read before, written back after).
;       A2 - the replacement value (for the u64 worker on 32-bit hosts this is
;            a pointer to the value instead).
;       A3 - pointer to the eflags variable.
;
; @param        1       Instruction prefix: empty or 'lock'.
; @param        2       Function name suffix: empty or '_locked'.
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; accumulator = *puEax (loaded after the flags since that clobbers eax)
        %1 cmpxchg [A0], A2_8
        mov     [A1], al                ; write the accumulator back
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; accumulator = *puEax
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax                ; write the accumulator back
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; accumulator = *puEax
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax               ; write the accumulator back
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; accumulator = *puEax
        %1 cmpxchg [A0], A2
        mov     [A1], rax               ; write the accumulator back
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = replacement value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = expected value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b only sets ZF; CF, PF, AF, SF and OF must be produced here.
        ; ZF=0 means the compare failed and edx:eax now holds the destination
        ; value, so the flags have to come from comparing the expected value
        ; (still at [esi]) against it.  (This branch used to be 'jz', which
        ; sent the failure case through 'cmp eax, eax' and reported ZF=1.)
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; equal: produce the flags of comparing a value with itself.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store                  ; high halves differ: use the flags of the high compare.
        cmp     [esi], eax              ; high halves equal: use the flags of the low compare.
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1604
;;
; Macro for implementing a unary operator.
;
; Emits plain and LOCK-prefixed workers for 8, 16 and 32 bit operands, plus
; the 64-bit pair when building for a 64-bit host (RT_ARCH_AMD64).  No 64-bit
; workers are generated here for 32-bit hosts.
;
; Register interface of every generated worker:
;       A0 - pointer to the destination memory operand (modified in place).
;       A1 - pointer to the eflags variable.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
        ; --- 8-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS    A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS          A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS    A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS          A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

        ; --- 16-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS    A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS          A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS    A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS          A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

        ; --- 32-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS    A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS          A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS    A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS          A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

        ; --- 64-bit (64-bit hosts only) ---
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS    A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS          A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS    A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS          A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1693
1694
1695;
1696; BSWAP. No flag changes.
1697;
1698; Each function takes one argument, pointer to the value to bswap
1699; (input/output). They all return void.
1700;
;; BSWAP with a 16-bit operand size: A0 = pointer to the value (input/output).
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; load all 32 bits, just in case any of the upper bits are used.
        db 66h                          ; hand-emitted operand-size prefix for the bswap below
        bswap   T0_32
        mov     [A0], T0_32             ; store all 32 bits back
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1709
;; 32-bit BSWAP: A0 = pointer to the value (input/output).
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; T0 = value
        bswap   T0_32                   ; reverse the byte order
        mov     [A0], T0_32             ; store the result in place
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1717
;; 64-bit BSWAP: A0 = pointer to the value (input/output).
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
        PROLOGUE_1_ARGS
%ifdef RT_ARCH_AMD64
        mov     T0, [A0]                ; T0 = value
        bswap   T0                      ; reverse the byte order
        mov     [A0], T0                ; store the result in place
%else
        ; 32-bit host: byte swap each half and store the halves crosswise.
        mov     T0, [A0]                ; T0 = low dword
        mov     T1, [A0 + 4]            ; T1 = high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; swapped low dword becomes the high dword
        mov     [A0], T1                ; swapped high dword becomes the low dword
%endif
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u64
1736
1737
;;
; Macro for implementing a shift or rotate operation.
;
; Emits the 8, 16 and 32 bit workers, plus the 64-bit worker when building
; for a 64-bit host (RT_ARCH_AMD64).  No 64-bit worker is generated here for
; 32-bit hosts.
;
; Register interface of every generated worker:
;       A0 - pointer to the destination memory operand.
;       A1 - the shift count.
;       A2 - pointer to the eflags variable.
;
; The count has to end up in cl.  With ASM_CALL64_GCC it is copied from A1;
; otherwise A0 and A1 are swapped, which relies on A0 being xCX there.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
 %ifndef ASM_CALL64_GCC
        xchg    A1, A0                  ; count -> xCX, destination pointer -> A1
        %1      byte [A1], cl
 %else
        mov     cl, A1_8                ; count -> cl
        %1      byte [A0], cl
 %endif
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
 %ifndef ASM_CALL64_GCC
        xchg    A1, A0                  ; count -> xCX, destination pointer -> A1
        %1      word [A1], cl
 %else
        mov     cl, A1_8                ; count -> cl
        %1      word [A0], cl
 %endif
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
 %ifndef ASM_CALL64_GCC
        xchg    A1, A0                  ; count -> xCX, destination pointer -> A1
        %1      dword [A1], cl
 %else
        mov     cl, A1_8                ; count -> cl
        %1      dword [A0], cl
 %endif
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
  %ifndef ASM_CALL64_GCC
        xchg    A1, A0                  ; count -> xCX, destination pointer -> A1
        %1      qword [A1], cl
  %else
        mov     cl, A1_8                ; count -> cl
        %1      qword [A0], cl
  %endif
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1824
1825
;;
; Macro for implementing a double precision shift operation.
;
; Emits the 16 and 32 bit workers, plus the 64-bit worker when building for
; a 64-bit host (RT_ARCH_AMD64).  No 64-bit worker is generated here for
; 32-bit hosts.
;
; Register interface of every generated worker:
;       A0 - pointer to the destination operand (r/m).
;       A1 - the source operand (reg).
;       A2 - the shift count.
;       A3 - pointer to the eflags variable.
;
; The count has to end up in cl.  With ASM_CALL64_GCC, A3 and A2 are swapped
; around the instruction (and swapped back); otherwise A0 and A2 are swapped
; once and the destination is addressed through A2.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
 %ifndef ASM_CALL64_GCC
        xchg    A0, A2                  ; count -> xCX, destination pointer -> A2
        %1      [A2], A1_16, cl
 %else
        xchg    A3, A2                  ; count -> xCX, eflags pointer parked in A2
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; restore the eflags pointer to A3
 %endif
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
 %ifndef ASM_CALL64_GCC
        xchg    A0, A2                  ; count -> xCX, destination pointer -> A2
        %1      [A2], A1_32, cl
 %else
        xchg    A3, A2                  ; count -> xCX, eflags pointer parked in A2
        %1      [A0], A1_32, cl
        xchg    A3, A2                  ; restore the eflags pointer to A3
 %endif
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
  %ifndef ASM_CALL64_GCC
        xchg    A0, A2                  ; count -> xCX, destination pointer -> A2
        %1      [A2], A1, cl
  %else
        xchg    A3, A2                  ; count -> xCX, eflags pointer parked in A2
        %1      [A0], A1, cl
        xchg    A3, A2                  ; restore the eflags pointer to A3
  %endif
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1896
1897
;;
; Macro for implementing the one-operand multiplication instructions.
;
; Emits the 8, 16 and 32 bit workers, plus the 64-bit worker when building
; for a 64-bit host (RT_ARCH_AMD64); the 32-bit host version of the latter
; lives in IEMAllAImplC.cpp.
;
; The 8-bit worker only operates on AX, so it takes no DX pointer:
;       A0 - pointer to AX, A1 - the operand, A2 - pointer to eflags.
; The other workers take:
;       A0 - pointer to rAX, A1 - pointer to rDX, A2 - the operand,
;       A3 - pointer to eflags.
;
; Every worker returns 0 in eax, so callers can treat these the same way as
; the div/idiv workers.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Name suffix.
; @param        5       EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;                       As coded, 1 selects IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF
;                       and every other value selects plain IEM_SAVE_FLAGS.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        mov     al, [A0]                ; al = first factor
        %1      A1_8                    ; ax = al * operand
        mov     [A0], ax
 %if %5 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %else
        IEM_SAVE_FLAGS                          A2, %2, %3
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        mov     ax, [A0]                ; ax = first factor
 %ifndef ASM_CALL64_GCC
        mov     T1, A1                  ; keep the rDX pointer in T1 across the instruction
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %else
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %endif
 %if %5 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %else
        IEM_SAVE_FLAGS                          A3, %2, %3
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        mov     eax, [A0]               ; eax = first factor
 %ifndef ASM_CALL64_GCC
        mov     T1, A1                  ; keep the rDX pointer in T1 across the instruction
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %else
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %endif
 %if %5 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %else
        IEM_SAVE_FLAGS                          A3, %2, %3
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        mov     rax, [A0]               ; rax = first factor
  %ifndef ASM_CALL64_GCC
        mov     T1, A1                  ; keep the rDX pointer in T1 across the instruction
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %else
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %endif
  %if %5 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
  %else
        IEM_SAVE_FLAGS                          A3, %2, %3
  %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2015
2016
2017BEGINCODE
;;
; Worker function for negating the 64-bit number held as two 32-bit halves
; in T1:T0 (T1 = high half, T0 = low half).  Computes 0 - T1:T0.
; @uses     None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    T1                      ; [xSP + xCB] = original high half
        push    T0                      ; [xSP]       = original low half
        xor     T0_32, T0_32            ; start from zero ...
        xor     T1_32, T1_32
        sub     T0_32, [xSP]            ; ... and subtract the original value,
        sbb     T1_32, [xSP + xCB]      ; carrying the borrow into the high half.
        add     xSP, xCB*2              ; drop the two temporaries
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2031
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the 128-bit number held as two 64-bit halves
; in T1:T0 (T1 = high half, T0 = low half).  Computes 0 - T1:T0.
; @uses     None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    T1                      ; [xSP + xCB] = original high half
        push    T0                      ; [xSP]       = original low half
        xor     T0_32, T0_32            ; start from zero (32-bit xor clears the whole 64-bit register) ...
        xor     T1_32, T1_32
        sub     T0, [xSP]               ; ... and subtract the original value,
        sbb     T1, [xSP + xCB]         ; carrying the borrow into the high half.
        add     xSP, xCB*2              ; drop the two temporaries
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2047
2048
2049;;
2050; Macro for implementing a division operations.
2051;
2052; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2053; 32-bit system where the 64-bit accesses requires hand coding.
2054;
2055; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2056; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2057; pointer to eflags in A3.
2058;
2059; The functions all return 0 on success and -1 if a divide error should be
2060; raised by the caller.
2061;
2062; @param 1 The instruction mnemonic.
2063; @param 2 The modified flags.
2064; @param 3 The undefined flags.
2065; @param 4 1 if signed, 0 if unsigned.
2066; @param 5 Function suffix.
2067; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2068; 2 for AMD (set AF, clear PF, ZF and SF).
2069;
2070; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2071;
2072%macro IEMIMPL_DIV_OP 6
2073BEGINCODE
2074BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2075 PROLOGUE_3_ARGS
2076
2077 ; div by chainsaw check.
2078 test A1_8, A1_8
2079 jz .div_zero
2080
2081 ; Overflow check - unsigned division is simple to verify, haven't
2082 ; found a simple way to check signed division yet unfortunately.
2083 %if %4 == 0
2084 cmp [A0 + 1], A1_8
2085 jae .div_overflow
2086 %else
2087 mov T0_16, [A0] ; T0 = dividend
2088 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2089 test A1_8, A1_8
2090 js .divisor_negative
2091 test T0_16, T0_16
2092 jns .both_positive
2093 neg T0_16
2094.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2095 push T0 ; Start off like unsigned below.
2096 shr T0_16, 7
2097 cmp T0_8, A1_8
2098 pop T0
2099 jb .div_no_overflow
2100 ja .div_overflow
2101 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2102 cmp T0_8, A1_8
2103 jae .div_overflow
2104 jmp .div_no_overflow
2105
2106.divisor_negative:
2107 neg A1_8
2108 test T0_16, T0_16
2109 jns .one_of_each
2110 neg T0_16
2111.both_positive: ; Same as unsigned shifted by sign indicator bit.
2112 shr T0_16, 7
2113 cmp T0_8, A1_8
2114 jae .div_overflow
2115.div_no_overflow:
2116 mov A1, T1 ; restore divisor
2117 %endif
2118
2119 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2120 mov ax, [A0]
2121 %1 A1_8
2122 mov [A0], ax
2123 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2124 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2125 %else
2126 IEM_SAVE_FLAGS A2, %2, %3
2127 %endif
2128 xor eax, eax
2129
2130.return:
2131 EPILOGUE_3_ARGS
2132
2133.div_zero:
2134.div_overflow:
2135 mov eax, -1
2136 jmp .return
2137ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2138
;;
; 16-bit flavour: divides the 32-bit value held in [A1]:[A0] (high:low word)
; by the 16-bit divisor in A2 using instruction %1.
;
; @returns eax = 0 on success, -1 on division by zero or quotient overflow
;          (nothing is written back in the failure cases).
; @param A0 Pointer to the low word of the dividend; receives ax (quotient).
; @param A1 Pointer to the high word of the dividend; receives dx (remainder).
; @param A2 The divisor.
; @param A3 Flags argument handed to the IEM_*_FLAGS macros (presumably a
;           pointer to the guest EFLAGS - confirm against the macros).
;
2139BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2140 PROLOGUE_4_ARGS
2141
2142 ; Division by zero check.
2143 test A2_16, A2_16
2144 jz .div_zero
2145
2146 ; Overflow check - unsigned division is simple to verify, haven't
2147 ; found a simple way to check signed division yet unfortunately.
2148 %if %4 == 0
2149 cmp [A1], A2_16 ; Unsigned: overflow when the high word >= divisor.
2150 jae .div_overflow
2151 %else
2152 mov T0_16, [A1]
2153 shl T0_32, 16 ; High word of the dividend into bits 16..31.
2154 mov T0_16, [A0] ; T0 = dividend
2155 mov T1, A2 ; T1 = divisor
2156 test T1_16, T1_16
2157 js .divisor_negative
2158 test T0_32, T0_32
2159 jns .both_positive
2160 neg T0_32 ; Work on the magnitude of the dividend.
2161.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2162 push T0 ; Start off like unsigned below.
2163 shr T0_32, 15
2164 cmp T0_16, T1_16
2165 pop T0 ; Restores the full magnitude; pop leaves the flags from the cmp intact.
2166 jb .div_no_overflow
2167 ja .div_overflow
2168 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2169 cmp T0_16, T1_16
2170 jae .div_overflow
2171 jmp .div_no_overflow
2172
2173.divisor_negative:
2174 neg T1_16 ; Work on the magnitude of the divisor (T1 is a copy, A2 is untouched).
2175 test T0_32, T0_32
2176 jns .one_of_each
2177 neg T0_32
2178.both_positive: ; Same as unsigned shifted by sign indicator bit.
2179 shr T0_32, 15
2180 cmp T0_16, T1_16
2181 jae .div_overflow
2182.div_no_overflow:
2183 %endif
2184
2185 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2186 %ifdef ASM_CALL64_GCC
2187 mov T1, A2 ; Divide by a copy; presumably A2 aliases rdx here, which dx overwrites below.
2188 mov ax, [A0]
2189 mov dx, [A1]
2190 %1 T1_16
2191 mov [A0], ax
2192 mov [A1], dx
2193 %else
2194 mov T1, A1 ; Address via a copy; presumably A1 aliases rdx here, which dx overwrites below.
2195 mov ax, [A0]
2196 mov dx, [T1]
2197 %1 A2_16
2198 mov [A0], ax
2199 mov [T1], dx
2200 %endif
2201 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2202 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2203 %else
2204 IEM_SAVE_FLAGS A3, %2, %3
2205 %endif
2206 xor eax, eax ; Return 0: success.
2207
2208.return:
2209 EPILOGUE_4_ARGS
2210
2211.div_zero:
2212.div_overflow:
2213 mov eax, -1 ; Return -1: the operands were left unmodified.
2214 jmp .return
2215ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2216
;;
; 32-bit flavour: divides the 64-bit value held in [A1]:[A0] (high:low dword)
; by the 32-bit divisor in A2 using instruction %1.
;
; @returns eax = 0 on success, -1 on division by zero or quotient overflow
;          (nothing is written back in the failure cases).
; @param A0 Pointer to the low dword of the dividend; receives eax (quotient).
; @param A1 Pointer to the high dword of the dividend; receives edx (remainder).
; @param A2 The divisor.
; @param A3 Flags argument handed to the IEM_*_FLAGS macros (presumably a
;           pointer to the guest EFLAGS - confirm against the macros).
;
2217BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2218 PROLOGUE_4_ARGS
2219
2220 ; Division by zero check.
2221 test A2_32, A2_32
2222 jz .div_zero
2223
2224 ; Overflow check - unsigned division is simple to verify, haven't
2225 ; found a simple way to check signed division yet unfortunately.
2226 %if %4 == 0
2227 cmp [A1], A2_32 ; Unsigned: overflow when the high dword >= divisor.
2228 jae .div_overflow
2229 %else
2230 push A2 ; save A2 so we can modify it (we're out of regs on x86).
2231 mov T0_32, [A0] ; T0 = dividend low
2232 mov T1_32, [A1] ; T1 = dividend high
2233 test A2_32, A2_32
2234 js .divisor_negative
2235 test T1_32, T1_32
2236 jns .both_positive
2237 call NAME(iemAImpl_negate_T0_T1_u32) ; Work on the magnitude of the dividend.
2238.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2239 push T0 ; Start off like unsigned below.
2240 shl T1_32, 1
2241 shr T0_32, 31
2242 or T1_32, T0_32 ; T1 = bits 31..62 of the dividend magnitude.
2243 cmp T1_32, A2_32
2244 pop T0 ; Restores the low dword; pop leaves the flags from the cmp intact.
2245 jb .div_no_overflow
2246 ja .div_overflow
2247 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2248 cmp T0_32, A2_32
2249 jae .div_overflow
2250 jmp .div_no_overflow
2251
2252.divisor_negative:
2253 neg A2_32 ; Work on the magnitude of the divisor; the original is on the stack.
2254 test T1_32, T1_32
2255 jns .one_of_each
2256 call NAME(iemAImpl_negate_T0_T1_u32)
2257.both_positive: ; Same as unsigned shifted by sign indicator bit.
2258 shl T1_32, 1
2259 shr T0_32, 31
2260 or T1_32, T0_32
2261 cmp T1_32, A2_32
2262 jae .div_overflow
2263.div_no_overflow:
2264 pop A2 ; Restore the original (possibly negative) divisor.
2265 %endif
2266
2267 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2269 %ifdef ASM_CALL64_GCC
2270 mov T1, A2 ; Divide by a copy; presumably A2 aliases rdx here, which edx overwrites below.
2271 mov eax, [A0]
2272 mov edx, [A1]
2273 %1 T1_32
2274 mov [A0], eax
2275 mov [A1], edx
2276 %else
2277 mov T1, A1 ; Address via a copy; presumably A1 aliases rdx here, which edx overwrites below.
2278 mov eax, [A0]
2279 mov edx, [T1]
2280 %1 A2_32
2281 mov [A0], eax
2282 mov [T1], edx
2283 %endif
2284 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2285 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2286 %else
2287 IEM_SAVE_FLAGS A3, %2, %3
2288 %endif
2289 xor eax, eax ; Return 0: success.
2290
2291.return:
2292 EPILOGUE_4_ARGS
2293
2294.div_overflow:
2295 %if %4 != 0
2296 pop A2 ; Balance the push done by the signed overflow check.
2297 %endif
2298.div_zero: ; Reached before anything was pushed, so no pop here.
2299 mov eax, -1 ; Return -1: the operands were left unmodified.
2300 jmp .return
2301ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2302
2303 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;;
; 64-bit flavour: divides the 128-bit value held in [A1]:[A0] (high:low qword)
; by the 64-bit divisor in A2 using instruction %1.
;
; @returns eax = 0 on success, -1 on division by zero or quotient overflow
;          (nothing is written back in the failure cases).
; @param A0 Pointer to the low qword of the dividend; receives rax (quotient).
; @param A1 Pointer to the high qword of the dividend; receives rdx (remainder).
; @param A2 The divisor.
; @param A3 Flags argument handed to the IEM_*_FLAGS macros (presumably a
;           pointer to the guest EFLAGS - confirm against the macros).
;
2304BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2305 PROLOGUE_4_ARGS
2306
2307 test A2, A2 ; Division by zero check.
2308 jz .div_zero
2309 %if %4 == 0
2310 cmp [A1], A2 ; Unsigned: overflow when the high qword >= divisor.
2311 jae .div_overflow
2312 %else
2313 push A2 ; save the original divisor, A2 may be negated in place below.
2314 mov T0, [A0] ; T0 = dividend low
2315 mov T1, [A1] ; T1 = dividend high
2316 test A2, A2
2317 js .divisor_negative
2318 test T1, T1
2319 jns .both_positive
2320 call NAME(iemAImpl_negate_T0_T1_u64) ; Work on the magnitude of the dividend.
2321.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2322 push T0 ; Start off like unsigned below.
2323 shl T1, 1
2324 shr T0, 63
2325 or T1, T0 ; T1 = bits 63..126 of the dividend magnitude.
2326 cmp T1, A2
2327 pop T0 ; Restores the low qword; pop leaves the flags from the cmp intact.
2328 jb .div_no_overflow
2329 ja .div_overflow
2330 mov T1, 0x7fffffffffffffff ; AND takes no 64-bit immediate, so go via T1 (no longer needed).
2331 and T0, T1 ; Special case for covering (divisor - 1).
2332 cmp T0, A2
2333 jae .div_overflow
2334 jmp .div_no_overflow
2335
2336.divisor_negative:
2337 neg A2 ; Work on the magnitude of the divisor; the original is on the stack.
2338 test T1, T1
2339 jns .one_of_each
2340 call NAME(iemAImpl_negate_T0_T1_u64)
2341.both_positive: ; Same as unsigned shifted by sign indicator bit.
2342 shl T1, 1
2343 shr T0, 63
2344 or T1, T0
2345 cmp T1, A2
2346 jae .div_overflow
2347.div_no_overflow:
2348 pop A2 ; Restore the original (possibly negative) divisor.
2349 %endif
2350
2351 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2353 %ifdef ASM_CALL64_GCC
2354 mov T1, A2 ; Divide by a copy; presumably A2 aliases rdx here, which rdx overwrites below.
2355 mov rax, [A0]
2356 mov rdx, [A1]
2357 %1 T1
2358 mov [A0], rax
2359 mov [A1], rdx
2360 %else
2361 mov T1, A1 ; Address via a copy; presumably A1 aliases rdx here, which rdx overwrites below.
2362 mov rax, [A0]
2363 mov rdx, [T1]
2364 %1 A2
2365 mov [A0], rax
2366 mov [T1], rdx
2367 %endif
2368 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2369 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2370 %else
2371 IEM_SAVE_FLAGS A3, %2, %3
2372 %endif
2373 xor eax, eax ; Return 0: success.
2374
2375.return:
2376 EPILOGUE_4_ARGS_EX 12
2377
2378.div_overflow:
2379 %if %4 != 0
2380 pop A2 ; Balance the push done by the signed overflow check.
2381 %endif
2382.div_zero: ; Reached before anything was pushed, so no pop here.
2383 mov eax, -1 ; Return -1: the operands were left unmodified.
2384 jmp .return
2385ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2386 %endif ; RT_ARCH_AMD64
2387
2388%endmacro
2389
2390IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
2391IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
2392IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
2393IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
2394IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
2395IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2396
2397
2398;;
2399; Macro for implementing a memory fence operation.
2400;
2401; The generated function has no return value and takes no operands.
2402;
2403; @param 1 The fence instruction; also used as the function name suffix.
2404;
2405%macro IEMIMPL_MEM_FENCE 1
2406BEGINCODE
2407BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
2408 %1 ; Just execute the fence instruction itself.
2409 ret
2410ENDPROC iemAImpl_ %+ %1
2411%endmacro
2412
2413IEMIMPL_MEM_FENCE lfence
2414IEMIMPL_MEM_FENCE sfence
2415IEMIMPL_MEM_FENCE mfence
2416
2417;;
2418; Alternative memory fence for hosts without SSE2.
2419;
2420BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
2421 push xAX ; Create a scratch stack slot holding xAX.
2422 xchg xAX, [xSP] ; XCHG with a memory operand is implicitly locked, which is what provides the fence.
2423 add xSP, xCB ; Drop the scratch slot; xAX still holds its original value.
2424 ret
2425ENDPROC iemAImpl_alt_mem_fence
2426
2427
2428;;
2429; Initialize the FPU for the actual instruction being emulated, this means
2430; loading parts of the guest's control word and status word.
2431;
2432; @uses 24 bytes of stack at [xSP] (not allocated here, the caller must reserve them). T0, T1. NOTE(review): a 32-bit protected-mode FNSTENV image is 28 bytes per the Intel SDM - confirm the figure.
2433; @param 1 Expression giving the address of the FXSTATE of the guest.
2434;
2435%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
2436 fnstenv [xSP] ; Dump the current environment so selected fields can be patched.
2437
2438 ; FCW - for exception, precision and rounding control.
2439 movzx T0, word [%1 + X86FXSTATE.FCW]
2440 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2441 mov [xSP + X86FSTENV32P.FCW], T0_16
2442
2443 ; FSW - for undefined C0, C1, C2, and C3.
2444 movzx T1, word [%1 + X86FXSTATE.FSW]
2445 and T1, X86_FSW_C_MASK ; Guest condition code bits only.
2446 movzx T0, word [xSP + X86FSTENV32P.FSW]
2447 and T0, X86_FSW_TOP_MASK ; Keep the current TOP, drop every other current status bit.
2448 or T0, T1
2449 mov [xSP + X86FSTENV32P.FSW], T0_16
2450
2451 fldenv [xSP] ; Activate the patched environment.
2452%endmacro
2453
2454
2455;;
2456; Initialize the FPU for the actual instruction being emulated, this means
2457; loading parts of the guest's control word, status word, and updating the
2458; tag word for the top register if it's empty in the guest state.
2459;
2460; ASSUMES actual TOP=7
2461;
2462; @uses 24 bytes of stack at [xSP] (not allocated here, the caller must reserve them). T0, T1. NOTE(review): a 32-bit protected-mode FNSTENV image is 28 bytes per the Intel SDM - confirm the figure.
2463; @param 1 Expression giving the address of the FXSTATE of the guest.
2464;
2465%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
2466 fnstenv [xSP] ; Dump the current environment so selected fields can be patched.
2467
2468 ; FCW - for exception, precision and rounding control.
2469 movzx T0_32, word [%1 + X86FXSTATE.FCW]
2470 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2471 mov [xSP + X86FSTENV32P.FCW], T0_16
2472
2473 ; FSW - for undefined C0, C1, C2, and C3.
2474 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2475 and T1_32, X86_FSW_C_MASK ; Guest condition code bits only.
2476 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
2477 and T0_32, X86_FSW_TOP_MASK ; Keep the current TOP, drop every other current status bit.
2478 or T0_32, T1_32
2479 mov [xSP + X86FSTENV32P.FSW], T0_16
2480
2481 ; FTW - Only for ST0 (in/out).
2482 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2483 shr T1_32, X86_FSW_TOP_SHIFT
2484 and T1_32, X86_FSW_TOP_SMASK ; T1 = guest TOP.
2485 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
2486 jc %%st0_not_empty
2487 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
2488%%st0_not_empty:
2489
2490 fldenv [xSP] ; Activate the patched environment.
2491%endmacro
2492
2493
2494;;
2495; Result of an FPU operation: one 80-bit value followed by the status word.
2496; Need to move this as well somewhere better?
2497struc IEMFPURESULT
2498 .r80Result resw 5 ; 80-bit result (5 words = 10 bytes).
2499 .FSW resw 1 ; FPU status word.
2500endstruc
2501
2502
2503;;
2504; Result of an FPU operation producing two 80-bit values and a status word.
2505; Need to move this as well somewhere better?
2506struc IEMFPURESULTTWO
2507 .r80Result1 resw 5 ; First 80-bit result (5 words = 10 bytes).
2508 .FSW resw 1 ; FPU status word, placed between the two results.
2509 .r80Result2 resw 5 ; Second 80-bit result.
2510endstruc
2511
2512
2513;
2514;---------------------- 16-bit signed integer operations ----------------------
2515;
2516
2517
2518;;
2519; Converts a 16-bit signed integer value to an 80-bit floating point one (fpu register).
2520;
2521; @param A0 FPU context (fxsave).
2522; @param A1 Pointer to a IEMFPURESULT for the output.
2523; @param A2 Pointer to the 16-bit signed integer value to convert.
2524;
2525BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
2526 PROLOGUE_3_ARGS
2527 sub xSP, 20h ; Scratch space for the FPU environment image.
2528
2529 fninit ; Start from the default FPU state.
2530 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2531 fild word [A2] ; ST0 = the integer operand.
2532
2533 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the load.
2534 fnclex ; Clear the exception flags before storing the result.
2535 fstp tword [A1 + IEMFPURESULT.r80Result]
2536
2537 fninit ; Leave the FPU in its default state.
2538 add xSP, 20h
2539 EPILOGUE_3_ARGS
2540ENDPROC iemAImpl_fild_r80_from_i16
2541
2542
2543;;
2544; Store an 80-bit floating point value (register) as a 16-bit signed integer (memory).
2545;
2546; @param A0 FPU context (fxsave).
2547; @param A1 Where to return the output FSW.
2548; @param A2 Where to store the 16-bit signed integer value.
2549; @param A3 Pointer to the 80-bit value.
2550;
2551BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
2552 PROLOGUE_4_ARGS
2553 sub xSP, 20h ; Scratch space for the FPU environment image.
2554
2555 fninit ; Start from the default FPU state.
2556 fld tword [A3] ; ST0 = value to store (loaded before the guest FCW is applied).
2557 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2558 fistp word [A2] ; Convert using the loaded rounding control and store.
2559
2560 fnstsw word [A1] ; Status word produced by the store.
2561
2562 fninit ; Leave the FPU in its default state.
2563 add xSP, 20h
2564 EPILOGUE_4_ARGS
2565ENDPROC iemAImpl_fist_r80_to_i16
2566
2567
2568;;
2569; Store an 80-bit floating point value (register) as a 16-bit signed integer
2570; (memory) with truncation.
2571;
2572; @param A0 FPU context (fxsave).
2573; @param A1 Where to return the output FSW.
2574; @param A2 Where to store the 16-bit signed integer value.
2575; @param A3 Pointer to the 80-bit value.
2576;
2577BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
2578 PROLOGUE_4_ARGS
2579 sub xSP, 20h ; Scratch space for the FPU environment image.
2580
2581 fninit ; Start from the default FPU state.
2582 fld tword [A3] ; ST0 = value to store (loaded before the guest FCW is applied).
2583 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2584 fisttp word [A2] ; Truncating conversion and store.
2585
2586 fnstsw word [A1] ; Status word produced by the store.
2587
2588 fninit ; Leave the FPU in its default state.
2589 add xSP, 20h
2590 EPILOGUE_4_ARGS
2591ENDPROC iemAImpl_fistt_r80_to_i16
2592
2593
2594;;
2595; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2596;
2597; @param 1 The instruction; also used to form the function name.
2598;
2599; @param A0 FPU context (fxsave).
2600; @param A1 Pointer to a IEMFPURESULT for the output.
2601; @param A2 Pointer to the 80-bit value.
2602; @param A3 Pointer to the 16-bit value.
2603;
2604%macro IEMIMPL_FPU_R80_BY_I16 1
2605BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2606 PROLOGUE_4_ARGS
2607 sub xSP, 20h ; Scratch space for the FPU environment image.
2608
2609 fninit ; Start from the default FPU state.
2610 fld tword [A2] ; ST0 = the 80-bit operand.
2611 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2612 %1 word [A3] ; ST0 = ST0 <op> 16-bit integer operand.
2613
2614 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the operation.
2615 fnclex ; Clear the exception flags before storing the result.
2616 fstp tword [A1 + IEMFPURESULT.r80Result]
2617
2618 fninit ; Leave the FPU in its default state.
2619 add xSP, 20h
2620 EPILOGUE_4_ARGS
2621ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2622%endmacro
2623
2624IEMIMPL_FPU_R80_BY_I16 fiadd
2625IEMIMPL_FPU_R80_BY_I16 fimul
2626IEMIMPL_FPU_R80_BY_I16 fisub
2627IEMIMPL_FPU_R80_BY_I16 fisubr
2628IEMIMPL_FPU_R80_BY_I16 fidiv
2629IEMIMPL_FPU_R80_BY_I16 fidivr
2630
2631
2632;;
2633; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2634; only returning FSW.
2635;
2636; @param 1 The instruction; also used to form the function name.
2637;
2638; @param A0 FPU context (fxsave).
2639; @param A1 Where to store the output FSW.
2640; @param A2 Pointer to the 80-bit value.
2641; @param A3 Pointer to the 16-bit value.
2642;
2643%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
2644BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2645 PROLOGUE_4_ARGS
2646 sub xSP, 20h ; Scratch space for the FPU environment image.
2647
2648 fninit ; Start from the default FPU state.
2649 fld tword [A2] ; ST0 = the 80-bit operand.
2650 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2651 %1 word [A3] ; Operate on ST0 and the 16-bit integer operand.
2652
2653 fnstsw word [A1] ; Only the status word is returned.
2654
2655 fninit ; Leave the FPU in its default state.
2656 add xSP, 20h
2657 EPILOGUE_4_ARGS
2658ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2659%endmacro
2660
2661IEMIMPL_FPU_R80_BY_I16_FSW ficom
2662
2663
2664
2665;
2666;---------------------- 32-bit signed integer operations ----------------------
2667;
2668
2669
2670;;
2671; Converts a 32-bit signed integer value to an 80-bit floating point one (fpu register).
2672;
2673; @param A0 FPU context (fxsave).
2674; @param A1 Pointer to a IEMFPURESULT for the output.
2675; @param A2 Pointer to the 32-bit signed integer value to convert.
2676;
2677BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
2678 PROLOGUE_3_ARGS
2679 sub xSP, 20h ; Scratch space for the FPU environment image.
2680
2681 fninit ; Start from the default FPU state.
2682 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2683 fild dword [A2] ; ST0 = the integer operand.
2684
2685 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the load.
2686 fnclex ; Clear the exception flags before storing the result.
2687 fstp tword [A1 + IEMFPURESULT.r80Result]
2688
2689 fninit ; Leave the FPU in its default state.
2690 add xSP, 20h
2691 EPILOGUE_3_ARGS
2692ENDPROC iemAImpl_fild_r80_from_i32
2693
2694
2695;;
2696; Store an 80-bit floating point value (register) as a 32-bit signed integer (memory).
2697;
2698; @param A0 FPU context (fxsave).
2699; @param A1 Where to return the output FSW.
2700; @param A2 Where to store the 32-bit signed integer value.
2701; @param A3 Pointer to the 80-bit value.
2702;
2703BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2704 PROLOGUE_4_ARGS
2705 sub xSP, 20h ; Scratch space for the FPU environment image.
2706
2707 fninit ; Start from the default FPU state.
2708 fld tword [A3] ; ST0 = value to store (loaded before the guest FCW is applied).
2709 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2710 fistp dword [A2] ; Convert using the loaded rounding control and store.
2711
2712 fnstsw word [A1] ; Status word produced by the store.
2713
2714 fninit ; Leave the FPU in its default state.
2715 add xSP, 20h
2716 EPILOGUE_4_ARGS
2717ENDPROC iemAImpl_fist_r80_to_i32
2718
2719
2720;;
2721; Store an 80-bit floating point value (register) as a 32-bit signed integer
2722; (memory) with truncation.
2723;
2724; @param A0 FPU context (fxsave).
2725; @param A1 Where to return the output FSW.
2726; @param A2 Where to store the 32-bit signed integer value.
2727; @param A3 Pointer to the 80-bit value.
2728;
2729BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2730 PROLOGUE_4_ARGS
2731 sub xSP, 20h ; Scratch space for the FPU environment image.
2732
2733 fninit ; Start from the default FPU state.
2734 fld tword [A3] ; ST0 = value to store (loaded before the guest FCW is applied).
2735 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2736 fisttp dword [A2] ; Truncating conversion and store.
2737
2738 fnstsw word [A1] ; Status word produced by the store.
2739
2740 fninit ; Leave the FPU in its default state.
2741 add xSP, 20h
2742 EPILOGUE_4_ARGS
2743ENDPROC iemAImpl_fistt_r80_to_i32
2744
2745
2746;;
2747; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2748;
2749; @param 1 The instruction; also used to form the function name.
2750;
2751; @param A0 FPU context (fxsave).
2752; @param A1 Pointer to a IEMFPURESULT for the output.
2753; @param A2 Pointer to the 80-bit value.
2754; @param A3 Pointer to the 32-bit value.
2755;
2756%macro IEMIMPL_FPU_R80_BY_I32 1
2757BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2758 PROLOGUE_4_ARGS
2759 sub xSP, 20h ; Scratch space for the FPU environment image.
2760
2761 fninit ; Start from the default FPU state.
2762 fld tword [A2] ; ST0 = the 80-bit operand.
2763 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2764 %1 dword [A3] ; ST0 = ST0 <op> 32-bit integer operand.
2765
2766 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the operation.
2767 fnclex ; Clear the exception flags before storing the result.
2768 fstp tword [A1 + IEMFPURESULT.r80Result]
2769
2770 fninit ; Leave the FPU in its default state.
2771 add xSP, 20h
2772 EPILOGUE_4_ARGS
2773ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2774%endmacro
2775
2776IEMIMPL_FPU_R80_BY_I32 fiadd
2777IEMIMPL_FPU_R80_BY_I32 fimul
2778IEMIMPL_FPU_R80_BY_I32 fisub
2779IEMIMPL_FPU_R80_BY_I32 fisubr
2780IEMIMPL_FPU_R80_BY_I32 fidiv
2781IEMIMPL_FPU_R80_BY_I32 fidivr
2782
2783
2784;;
2785; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2786; only returning FSW.
2787;
2788; @param 1 The instruction; also used to form the function name.
2789;
2790; @param A0 FPU context (fxsave).
2791; @param A1 Where to store the output FSW.
2792; @param A2 Pointer to the 80-bit value.
2793; @param A3 Pointer to the 32-bit value.
2794;
2795%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2796BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2797 PROLOGUE_4_ARGS
2798 sub xSP, 20h ; Scratch space for the FPU environment image.
2799
2800 fninit ; Start from the default FPU state.
2801 fld tword [A2] ; ST0 = the 80-bit operand.
2802 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2803 %1 dword [A3] ; Operate on ST0 and the 32-bit integer operand.
2804
2805 fnstsw word [A1] ; Only the status word is returned.
2806
2807 fninit ; Leave the FPU in its default state.
2808 add xSP, 20h
2809 EPILOGUE_4_ARGS
2810ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2811%endmacro
2812
2813IEMIMPL_FPU_R80_BY_I32_FSW ficom
2814
2815
2816
2817;
2818;---------------------- 64-bit signed integer operations ----------------------
2819;
2820
2821
2822;;
2823; Converts a 64-bit signed integer value to an 80-bit floating point one (fpu register).
2824;
2825; @param A0 FPU context (fxsave).
2826; @param A1 Pointer to a IEMFPURESULT for the output.
2827; @param A2 Pointer to the 64-bit signed integer value to convert.
2828;
2829BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
2830 PROLOGUE_3_ARGS
2831 sub xSP, 20h ; Scratch space for the FPU environment image.
2832
2833 fninit ; Start from the default FPU state.
2834 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2835 fild qword [A2] ; ST0 = the integer operand.
2836
2837 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the load.
2838 fnclex ; Clear the exception flags before storing the result.
2839 fstp tword [A1 + IEMFPURESULT.r80Result]
2840
2841 fninit ; Leave the FPU in its default state.
2842 add xSP, 20h
2843 EPILOGUE_3_ARGS
2844ENDPROC iemAImpl_fild_r80_from_i64
2845
2846
2847;;
2848; Store an 80-bit floating point value (register) as a 64-bit signed integer (memory).
2849;
2850; @param A0 FPU context (fxsave).
2851; @param A1 Where to return the output FSW.
2852; @param A2 Where to store the 64-bit signed integer value.
2853; @param A3 Pointer to the 80-bit value.
2854;
2855BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2856 PROLOGUE_4_ARGS
2857 sub xSP, 20h ; Scratch space for the FPU environment image.
2858
2859 fninit ; Start from the default FPU state.
2860 fld tword [A3] ; ST0 = value to store (loaded before the guest FCW is applied).
2861 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2862 fistp qword [A2] ; Convert using the loaded rounding control and store.
2863
2864 fnstsw word [A1] ; Status word produced by the store.
2865
2866 fninit ; Leave the FPU in its default state.
2867 add xSP, 20h
2868 EPILOGUE_4_ARGS
2869ENDPROC iemAImpl_fist_r80_to_i64
2870
2871
2872;;
2873; Store an 80-bit floating point value (register) as a 64-bit signed integer
2874; (memory) with truncation.
2875;
2876; @param A0 FPU context (fxsave).
2877; @param A1 Where to return the output FSW.
2878; @param A2 Where to store the 64-bit signed integer value.
2879; @param A3 Pointer to the 80-bit value.
2880;
2881BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
2882 PROLOGUE_4_ARGS
2883 sub xSP, 20h ; Scratch space for the FPU environment image.
2884
2885 fninit ; Start from the default FPU state.
2886 fld tword [A3] ; ST0 = value to store (loaded before the guest FCW is applied).
2887 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2888 fisttp qword [A2] ; Truncating conversion and store.
2889
2890 fnstsw word [A1] ; Status word produced by the store.
2891
2892 fninit ; Leave the FPU in its default state.
2893 add xSP, 20h
2894 EPILOGUE_4_ARGS
2895ENDPROC iemAImpl_fistt_r80_to_i64
2896
2897
2898
2899;
2900;---------------------- 32-bit floating point operations ----------------------
2901;
2902
2903;;
2904; Converts a 32-bit floating point value to an 80-bit one (fpu register).
2905;
2906; @param A0 FPU context (fxsave).
2907; @param A1 Pointer to a IEMFPURESULT for the output.
2908; @param A2 Pointer to the 32-bit floating point value to convert.
2909;
2910BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
2911 PROLOGUE_3_ARGS
2912 sub xSP, 20h ; Scratch space for the FPU environment image.
2913
2914 fninit ; Start from the default FPU state.
2915 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2916 fld dword [A2] ; ST0 = the 32-bit operand.
2917
2918 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the load.
2919 fnclex ; Clear the exception flags before storing the result.
2920 fstp tword [A1 + IEMFPURESULT.r80Result]
2921
2922 fninit ; Leave the FPU in its default state.
2923 add xSP, 20h
2924 EPILOGUE_3_ARGS
2925ENDPROC iemAImpl_fld_r80_from_r32
2926
2927
2928;;
2929; Store an 80-bit floating point value (register) as a 32-bit one (memory).
2930;
2931; @param A0 FPU context (fxsave).
2932; @param A1 Where to return the output FSW.
2933; @param A2 Where to store the 32-bit value.
2934; @param A3 Pointer to the 80-bit value.
2935;
2936BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
2937 PROLOGUE_4_ARGS
2938 sub xSP, 20h ; Scratch space for the FPU environment image.
2939
2940 fninit ; Start from the default FPU state.
2941 fld tword [A3] ; ST0 = value to store (loaded before the guest FCW is applied).
2942 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2943 fst dword [A2] ; Convert to 32-bit and store (no pop; the fninit below resets the stack).
2944
2945 fnstsw word [A1] ; Status word produced by the store.
2946
2947 fninit ; Leave the FPU in its default state.
2948 add xSP, 20h
2949 EPILOGUE_4_ARGS
2950ENDPROC iemAImpl_fst_r80_to_r32
2951
2952
2953;;
2954; FPU instruction working on one 80-bit and one 32-bit floating point value.
2955;
2956; @param 1 The instruction; also used to form the function name.
2957;
2958; @param A0 FPU context (fxsave).
2959; @param A1 Pointer to a IEMFPURESULT for the output.
2960; @param A2 Pointer to the 80-bit value.
2961; @param A3 Pointer to the 32-bit value.
2962;
2963%macro IEMIMPL_FPU_R80_BY_R32 1
2964BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2965 PROLOGUE_4_ARGS
2966 sub xSP, 20h ; Scratch space for the FPU environment image.
2967
2968 fninit ; Start from the default FPU state.
2969 fld tword [A2] ; ST0 = the 80-bit operand.
2970 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
2971 %1 dword [A3] ; ST0 = ST0 <op> 32-bit operand.
2972
2973 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the operation.
2974 fnclex ; Clear the exception flags before storing the result.
2975 fstp tword [A1 + IEMFPURESULT.r80Result]
2976
2977 fninit ; Leave the FPU in its default state.
2978 add xSP, 20h
2979 EPILOGUE_4_ARGS
2980ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2981%endmacro
2982
2983IEMIMPL_FPU_R80_BY_R32 fadd
2984IEMIMPL_FPU_R80_BY_R32 fmul
2985IEMIMPL_FPU_R80_BY_R32 fsub
2986IEMIMPL_FPU_R80_BY_R32 fsubr
2987IEMIMPL_FPU_R80_BY_R32 fdiv
2988IEMIMPL_FPU_R80_BY_R32 fdivr
2989
2990
2991;;
2992; FPU instruction working on one 80-bit and one 32-bit floating point value,
2993; only returning FSW.
2994;
2995; @param 1 The instruction; also used to form the function name.
2996;
2997; @param A0 FPU context (fxsave).
2998; @param A1 Where to store the output FSW.
2999; @param A2 Pointer to the 80-bit value.
3000; @param A3 Pointer to the 32-bit value.
3001;
3002%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3003BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3004 PROLOGUE_4_ARGS
3005 sub xSP, 20h ; Scratch space for the FPU environment image.
3006
3007 fninit ; Start from the default FPU state.
3008 fld tword [A2] ; ST0 = the 80-bit operand.
3009 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3010 %1 dword [A3] ; Operate on ST0 and the 32-bit operand.
3011
3012 fnstsw word [A1] ; Only the status word is returned.
3013
3014 fninit ; Leave the FPU in its default state.
3015 add xSP, 20h
3016 EPILOGUE_4_ARGS
3017ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3018%endmacro
3019
3020IEMIMPL_FPU_R80_BY_R32_FSW fcom
3021
3022
3023
3024;
3025;---------------------- 64-bit floating point operations ----------------------
3026;
3027
3028;;
3029; Converts a 64-bit floating point value to an 80-bit one (fpu register).
3030;
3031; @param A0 FPU context (fxsave).
3032; @param A1 Pointer to a IEMFPURESULT for the output.
3033; @param A2 Pointer to the 64-bit floating point value to convert.
3034;
3035BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3036 PROLOGUE_3_ARGS
3037 sub xSP, 20h ; Scratch space for the FPU environment image.
3038
3039 fninit ; Start from the default FPU state.
3040 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3041 fld qword [A2] ; ST0 = the 64-bit operand.
3042
3043 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the load.
3044 fnclex ; Clear the exception flags before storing the result.
3045 fstp tword [A1 + IEMFPURESULT.r80Result]
3046
3047 fninit ; Leave the FPU in its default state.
3048 add xSP, 20h
3049 EPILOGUE_3_ARGS
3050ENDPROC iemAImpl_fld_r80_from_r64
3051
3052
3053;;
3054; Store an 80-bit floating point value (register) as a 64-bit one (memory).
3055;
3056; @param A0 FPU context (fxsave).
3057; @param A1 Where to return the output FSW.
3058; @param A2 Where to store the 64-bit value.
3059; @param A3 Pointer to the 80-bit value.
3060;
3061BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3062 PROLOGUE_4_ARGS
3063 sub xSP, 20h ; Scratch space for the FPU environment image.
3064
3065 fninit ; Start from the default FPU state.
3066 fld tword [A3] ; ST0 = value to store (loaded before the guest FCW is applied).
3067 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3068 fst qword [A2] ; Convert to 64-bit and store (no pop; the fninit below resets the stack).
3069
3070 fnstsw word [A1] ; Status word produced by the store.
3071
3072 fninit ; Leave the FPU in its default state.
3073 add xSP, 20h
3074 EPILOGUE_4_ARGS
3075ENDPROC iemAImpl_fst_r80_to_r64
3076
3077
3078;;
3079; FPU instruction working on one 80-bit and one 64-bit floating point value.
3080;
3081; @param 1 The instruction; also used to form the function name.
3082;
3083; @param A0 FPU context (fxsave).
3084; @param A1 Pointer to a IEMFPURESULT for the output.
3085; @param A2 Pointer to the 80-bit value.
3086; @param A3 Pointer to the 64-bit value.
3087;
3088%macro IEMIMPL_FPU_R80_BY_R64 1
3089BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3090 PROLOGUE_4_ARGS
3091 sub xSP, 20h ; Scratch space for the FPU environment image.
3092
3093 fninit ; Start from the default FPU state.
3094 fld tword [A2] ; ST0 = the 80-bit operand.
3095 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3096 %1 qword [A3] ; ST0 = ST0 <op> 64-bit operand.
3097
3098 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the operation.
3099 fnclex ; Clear the exception flags before storing the result.
3100 fstp tword [A1 + IEMFPURESULT.r80Result]
3101
3102 fninit ; Leave the FPU in its default state.
3103 add xSP, 20h
3104 EPILOGUE_4_ARGS
3105ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3106%endmacro
3107
3108IEMIMPL_FPU_R80_BY_R64 fadd
3109IEMIMPL_FPU_R80_BY_R64 fmul
3110IEMIMPL_FPU_R80_BY_R64 fsub
3111IEMIMPL_FPU_R80_BY_R64 fsubr
3112IEMIMPL_FPU_R80_BY_R64 fdiv
3113IEMIMPL_FPU_R80_BY_R64 fdivr
3114
3115;;
3116; FPU instruction working on one 80-bit and one 64-bit floating point value,
3117; only returning FSW.
3118;
3119; @param 1 The instruction; also used to form the function name.
3120;
3121; @param A0 FPU context (fxsave).
3122; @param A1 Where to store the output FSW.
3123; @param A2 Pointer to the 80-bit value.
3124; @param A3 Pointer to the 64-bit value.
3125;
3126%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3127BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3128 PROLOGUE_4_ARGS
3129 sub xSP, 20h ; Scratch space for the FPU environment image.
3130
3131 fninit ; Start from the default FPU state.
3132 fld tword [A2] ; ST0 = the 80-bit operand.
3133 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3134 %1 qword [A3] ; Operate on ST0 and the 64-bit operand.
3135
3136 fnstsw word [A1] ; Only the status word is returned.
3137
3138 fninit ; Leave the FPU in its default state.
3139 add xSP, 20h
3140 EPILOGUE_4_ARGS
3141ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3142%endmacro
3143
3144IEMIMPL_FPU_R80_BY_R64_FSW fcom
3145
3146
3147
3148;
3149;---------------------- 80-bit floating point operations ----------------------
3150;
3151
3152;;
3153; Loads an 80-bit floating point register value from memory.
3154;
3155; @param A0 FPU context (fxsave).
3156; @param A1 Pointer to a IEMFPURESULT for the output.
3157; @param A2 Pointer to the 80-bit floating point value to load.
3158;
3159BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3160 PROLOGUE_3_ARGS
3161 sub xSP, 20h ; Scratch space for the FPU environment image.
3162
3163 fninit ; Start from the default FPU state.
3164 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3165 fld tword [A2] ; ST0 = the 80-bit operand.
3166
3167 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the load.
3168 fnclex ; Clear the exception flags before storing the result.
3169 fstp tword [A1 + IEMFPURESULT.r80Result]
3170
3171 fninit ; Leave the FPU in its default state.
3172 add xSP, 20h
3173 EPILOGUE_3_ARGS
3174ENDPROC iemAImpl_fld_r80_from_r80
3175
3176
3177;;
3178; Store an 80-bit floating point register to memory.
3179;
3180; @param A0 FPU context (fxsave).
3181; @param A1 Where to return the output FSW.
3182; @param A2 Where to store the 80-bit value.
3183; @param A3 Pointer to the 80-bit register value.
3184;
3185BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3186 PROLOGUE_4_ARGS
3187 sub xSP, 20h ; Scratch space for the FPU environment image.
3188
3189 fninit ; Start from the default FPU state.
3190 fld tword [A3] ; ST0 = value to store (loaded before the guest FCW is applied).
3191 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3192 fstp tword [A2] ; Store the 80-bit value and pop.
3193
3194 fnstsw word [A1] ; Status word produced by the store.
3195
3196 fninit ; Leave the FPU in its default state.
3197 add xSP, 20h
3198 EPILOGUE_4_ARGS
3199ENDPROC iemAImpl_fst_r80_to_r80
3200
3201
3202;;
3203; Loads an 80-bit floating point register value in BCD format from memory.
3204;
3205; @param A0 FPU context (fxsave).
3206; @param A1 Pointer to a IEMFPURESULT for the output.
3207; @param A2 Pointer to the 80-bit BCD value to load.
3208;
3209BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3210 PROLOGUE_3_ARGS
3211 sub xSP, 20h ; Scratch space for the FPU environment image.
3212
3213 fninit ; Start from the default FPU state.
3214 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3215 fbld tword [A2] ; ST0 = the packed BCD operand converted to floating point.
3216
3217 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the load.
3218 fnclex ; Clear the exception flags before storing the result.
3219 fstp tword [A1 + IEMFPURESULT.r80Result]
3220
3221 fninit ; Leave the FPU in its default state.
3222 add xSP, 20h
3223 EPILOGUE_3_ARGS
3224ENDPROC iemAImpl_fld_r80_from_d80
3225
3226
3227;;
3228; Store an 80-bit floating point register to memory as BCD.
3229;
3230; @param A0 FPU context (fxsave).
3231; @param A1 Where to return the output FSW.
3232; @param A2 Where to store the 80-bit BCD value.
3233; @param A3 Pointer to the 80-bit register value.
3234;
3235BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3236 PROLOGUE_4_ARGS
3237 sub xSP, 20h ; Scratch space for the FPU environment image.
3238
3239 fninit ; Start from the default FPU state.
3240 fld tword [A3] ; ST0 = value to store (loaded before the guest FCW is applied).
3241 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3242 fbstp tword [A2] ; Convert to packed BCD, store and pop.
3243
3244 fnstsw word [A1] ; Status word produced by the store.
3245
3246 fninit ; Leave the FPU in its default state.
3247 add xSP, 20h
3248 EPILOGUE_4_ARGS
3249ENDPROC iemAImpl_fst_r80_to_d80
3250
3251
3252;;
3253; FPU instruction working on two 80-bit floating point values.
3254;
3255; @param 1 The instruction; also used to form the function name.
; @param 2 The operand list to emit after the instruction ({} when it takes none).
3256;
3257; @param A0 FPU context (fxsave).
3258; @param A1 Pointer to a IEMFPURESULT for the output.
3259; @param A2 Pointer to the first 80-bit value (ST0)
3260; @param A3 Pointer to the second 80-bit value (STn).
3261;
3262%macro IEMIMPL_FPU_R80_BY_R80 2
3263BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3264 PROLOGUE_4_ARGS
3265 sub xSP, 20h ; Scratch space for the FPU environment image.
3266
3267 fninit ; Start from the default FPU state.
3268 fld tword [A3] ; Pushed first, so it ends up as ST1.
3269 fld tword [A2] ; ST0 = first operand.
3270 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3271 %1 %2
3272
3273 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the operation.
3274 fnclex ; Clear the exception flags before storing the result.
3275 fstp tword [A1 + IEMFPURESULT.r80Result] ; Result is taken from ST0.
3276
3277 fninit ; Leave the FPU in its default state.
3278 add xSP, 20h
3279 EPILOGUE_4_ARGS
3280ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3281%endmacro
3282
3283IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3284IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3285IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3286IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3287IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3288IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3289IEMIMPL_FPU_R80_BY_R80 fprem, {}
3290IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3291IEMIMPL_FPU_R80_BY_R80 fscale, {}
3292
3293
3294;;
3295; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3296; storing the result in ST1 and popping the stack.
3297;
3298; @param 1 The instruction; also used to form the function name.
3299;
3300; @param A0 FPU context (fxsave).
3301; @param A1 Pointer to a IEMFPURESULT for the output.
3302; @param A2 Pointer to the first 80-bit value (ST1).
3303; @param A3 Pointer to the second 80-bit value (ST0).
3304;
3305%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3306BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3307 PROLOGUE_4_ARGS
3308 sub xSP, 20h ; Scratch space for the FPU environment image.
3309
3310 fninit ; Start from the default FPU state.
3311 fld tword [A2] ; Pushed first, so it ends up as ST1.
3312 fld tword [A3] ; ST0 = second operand.
3313 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3314 %1 ; Result replaces ST1 and the stack is popped, leaving it in ST0.
3315
3316 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the operation.
3317 fnclex ; Clear the exception flags before storing the result.
3318 fstp tword [A1 + IEMFPURESULT.r80Result]
3319
3320 fninit ; Leave the FPU in its default state.
3321 add xSP, 20h
3322 EPILOGUE_4_ARGS
3323ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3324%endmacro
3325
3326IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3327IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3328IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3329
3330
3331;;
3332; FPU instruction working on two 80-bit floating point values, only
3333; returning FSW.
3334;
3335; @param 1 The instruction; also used to form the function name.
3336;
3337; @param A0 FPU context (fxsave).
3338; @param A1 Pointer to a uint16_t for the resulting FSW.
3339; @param A2 Pointer to the first 80-bit value.
3340; @param A3 Pointer to the second 80-bit value.
3341;
3342%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3343BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3344 PROLOGUE_4_ARGS
3345 sub xSP, 20h ; Scratch space for the FPU environment image.
3346
3347 fninit ; Start from the default FPU state.
3348 fld tword [A3] ; Pushed first, so it ends up as ST1.
3349 fld tword [A2] ; ST0 = first operand.
3350 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3351 %1 st0, st1
3352
3353 fnstsw word [A1] ; Only the status word is returned.
3354
3355 fninit ; Leave the FPU in its default state.
3356 add xSP, 20h
3357 EPILOGUE_4_ARGS
3358ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3359%endmacro
3360
3361IEMIMPL_FPU_R80_BY_R80_FSW fcom
3362IEMIMPL_FPU_R80_BY_R80_FSW fucom
3363
3364
3365;;
3366; FPU instruction working on two 80-bit floating point values,
3367; returning FSW and EFLAGS (eax).
3368;
3369; @param 1 The instruction; also used to form the function name.
3370;
3371; @returns EFLAGS in EAX.
3372; @param A0 FPU context (fxsave).
3373; @param A1 Pointer to a uint16_t for the resulting FSW.
3374; @param A2 Pointer to the first 80-bit value.
3375; @param A3 Pointer to the second 80-bit value.
3376;
3377%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
3378BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3379 PROLOGUE_4_ARGS
3380 sub xSP, 20h ; Scratch space for the FPU environment image.
3381
3382 fninit ; Start from the default FPU state.
3383 fld tword [A3] ; Pushed first, so it ends up as ST1.
3384 fld tword [A2] ; ST0 = first operand.
3385 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3386 %1 st1
3387
3388 fnstsw word [A1] ; Status word produced by the comparison.
3389 pushf ; Grab the EFLAGS left by the instruction before anything below changes them...
3390 pop xAX ; ...and return them in xAX.
3391
3392 fninit ; Leave the FPU in its default state.
3393 add xSP, 20h
3394 EPILOGUE_4_ARGS
3395ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3396%endmacro
3397
3398IEMIMPL_FPU_R80_BY_R80_EFL fcomi
3399IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3400
3401
3402;;
3403; FPU instruction working on one 80-bit floating point value.
3404;
3405; @param 1 The instruction; also used to form the function name.
3406;
3407; @param A0 FPU context (fxsave).
3408; @param A1 Pointer to a IEMFPURESULT for the output.
3409; @param A2 Pointer to the 80-bit value.
3410;
3411%macro IEMIMPL_FPU_R80 1
3412BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3413 PROLOGUE_3_ARGS
3414 sub xSP, 20h ; Scratch space for the FPU environment image.
3415
3416 fninit ; Start from the default FPU state.
3417 fld tword [A2] ; ST0 = the operand.
3418 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply the guest FCW and C0-C3.
3419 %1 ; Operates on ST0 in place.
3420
3421 fnstsw word [A1 + IEMFPURESULT.FSW] ; Status word produced by the operation.
3422 fnclex ; Clear the exception flags before storing the result.
3423 fstp tword [A1 + IEMFPURESULT.r80Result]
3424
3425 fninit ; Leave the FPU in its default state.
3426 add xSP, 20h
3427 EPILOGUE_3_ARGS
3428ENDPROC iemAImpl_ %+ %1 %+ _r80
3429%endmacro
3430
3431IEMIMPL_FPU_R80 fchs
3432IEMIMPL_FPU_R80 fabs
3433IEMIMPL_FPU_R80 f2xm1
3434IEMIMPL_FPU_R80 fsqrt
3435IEMIMPL_FPU_R80 frndint
3436IEMIMPL_FPU_R80 fsin
3437IEMIMPL_FPU_R80 fcos
3438
3439
3440;;
3441; FPU instruction working on one 80-bit floating point value, only
3442; returning FSW.
3443;
3444; @param 1 The instruction
3445; @param 2 Non-zero to also restore FTW.
3446;
3447; @param A0 FPU context (fxsave).
3448; @param A1 Pointer to a uint16_t for the resulting FSW.
3449; @param A2 Pointer to the 80-bit value.
3450;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]              ; operand -> st0
        ; The second macro parameter picks the state loader variant (the
        ; non-zero case uses the one with the _AND_FTW_0 suffix).
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro
3472
3473IEMIMPL_FPU_R80_FSW ftst, 0
3474IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3475
3476
3477
3478;;
3479; FPU instruction loading a 80-bit floating point constant.
3480;
3481; @param 1 The instruction
3482;
3483; @param A0 FPU context (fxsave).
3484; @param A1 Pointer to a IEMFPURESULT for the output.
3485;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        ; Clean FPU with the control/status state taken from the context at
        ; A0, then let the instruction push its constant onto the stack.
        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Record the status word first, then clear pending exceptions so
        ; that storing the result cannot be disturbed by them.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_2_ARGS
        ; The name must match the BEGINPROC_FASTCALL line exactly; there is
        ; no suffix, so no trailing concatenation operator either.
ENDPROC iemAImpl_ %+ %1
%endmacro
3504
3505IEMIMPL_FPU_R80_CONST fld1
3506IEMIMPL_FPU_R80_CONST fldl2t
3507IEMIMPL_FPU_R80_CONST fldl2e
3508IEMIMPL_FPU_R80_CONST fldpi
3509IEMIMPL_FPU_R80_CONST fldlg2
3510IEMIMPL_FPU_R80_CONST fldln2
3511IEMIMPL_FPU_R80_CONST fldz
3512
3513
3514;;
; FPU instruction working on one 80-bit floating point value, outputting two.
3516;
3517; @param 1 The instruction
3518;
3519; @param A0 FPU context (fxsave).
3520; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3521; @param A2 Pointer to the 80-bit value.
3522;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]              ; operand -> st0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Status word first.  The value on top of the stack is stored as
        ; result 2 and the one below it as result 1; exceptions are cleared
        ; ahead of each store.
        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit                          ; leave the host FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro
3544
3545IEMIMPL_FPU_R80_R80 fptan
3546IEMIMPL_FPU_R80_R80 fxtract
3547IEMIMPL_FPU_R80_R80 fsincos
3548
3549
3550
3551
3552;---------------------- SSE and MMX Operations ----------------------
3553
;
; Hook macros bracketing the body of every MMX, SSE and AVX helper below.
; They currently expand to nothing; they exist so that any state handling
; that turns out to be necessary can be added in a single place.
;

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3571
3572
3573;;
3574; Media instruction working on two full sized registers.
3575;
3576; @param 1 The instruction
3577; @param 2 Whether there is an MMX variant (1) or not (0).
3578;
3579; @param A0 FPU context (fxsave).
3580; @param A1 Pointer to the first media register size operand (input/output).
3581; @param A2 Pointer to the second media register size operand (input).
3582;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
        ; 64-bit MMX flavour: the qword at A1 is combined with the one at A2.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]               ; second operand (input only)
        movq    mm0, [A1]               ; first operand (input and output)
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

        ; 128-bit SSE flavour; unaligned moves, so no alignment is required.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]              ; second operand (input only)
        movdqu  xmm0, [A1]              ; first operand (input and output)
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3612
3613IEMIMPL_MEDIA_F2 pshufb, 1
3614IEMIMPL_MEDIA_F2 pand, 1
3615IEMIMPL_MEDIA_F2 pandn, 1
3616IEMIMPL_MEDIA_F2 por, 1
3617IEMIMPL_MEDIA_F2 pxor, 1
3618IEMIMPL_MEDIA_F2 pcmpeqb, 1
3619IEMIMPL_MEDIA_F2 pcmpeqw, 1
3620IEMIMPL_MEDIA_F2 pcmpeqd, 1
3621IEMIMPL_MEDIA_F2 pcmpeqq, 0
3622IEMIMPL_MEDIA_F2 pcmpgtb, 1
3623IEMIMPL_MEDIA_F2 pcmpgtw, 1
3624IEMIMPL_MEDIA_F2 pcmpgtd, 1
3625IEMIMPL_MEDIA_F2 pcmpgtq, 0
3626IEMIMPL_MEDIA_F2 paddb, 1
3627IEMIMPL_MEDIA_F2 paddw, 1
3628IEMIMPL_MEDIA_F2 paddd, 1
3629IEMIMPL_MEDIA_F2 paddq, 1
3630IEMIMPL_MEDIA_F2 paddsb, 1
3631IEMIMPL_MEDIA_F2 paddsw, 1
3632IEMIMPL_MEDIA_F2 paddusb, 1
3633IEMIMPL_MEDIA_F2 paddusw, 1
3634IEMIMPL_MEDIA_F2 psubb, 1
3635IEMIMPL_MEDIA_F2 psubw, 1
3636IEMIMPL_MEDIA_F2 psubd, 1
3637IEMIMPL_MEDIA_F2 psubq, 1
3638IEMIMPL_MEDIA_F2 psubsb, 1
3639IEMIMPL_MEDIA_F2 psubsw, 1
3640IEMIMPL_MEDIA_F2 psubusb, 1
3641IEMIMPL_MEDIA_F2 psubusw, 1
3642IEMIMPL_MEDIA_F2 pmullw, 1
3643IEMIMPL_MEDIA_F2 pmulld, 0
3644IEMIMPL_MEDIA_F2 pmulhw, 1
3645IEMIMPL_MEDIA_F2 pmaddwd, 1
3646IEMIMPL_MEDIA_F2 pminub, 1
3647IEMIMPL_MEDIA_F2 pminuw, 0
3648IEMIMPL_MEDIA_F2 pminud, 0
3649IEMIMPL_MEDIA_F2 pminsb, 0
3650IEMIMPL_MEDIA_F2 pminsw, 1
3651IEMIMPL_MEDIA_F2 pminsd, 0
3652IEMIMPL_MEDIA_F2 pmaxub, 1
3653IEMIMPL_MEDIA_F2 pmaxuw, 0
3654IEMIMPL_MEDIA_F2 pmaxud, 0
3655IEMIMPL_MEDIA_F2 pmaxsb, 0
3656IEMIMPL_MEDIA_F2 pmaxsw, 1
3657IEMIMPL_MEDIA_F2 pmaxsd, 0
3658IEMIMPL_MEDIA_F2 pabsb, 1
3659IEMIMPL_MEDIA_F2 pabsw, 1
3660IEMIMPL_MEDIA_F2 pabsd, 1
3661IEMIMPL_MEDIA_F2 psignb, 1
3662IEMIMPL_MEDIA_F2 psignw, 1
3663IEMIMPL_MEDIA_F2 psignd, 1
3664IEMIMPL_MEDIA_F2 phaddw, 1
3665IEMIMPL_MEDIA_F2 phaddd, 1
3666IEMIMPL_MEDIA_F2 phsubw, 1
3667IEMIMPL_MEDIA_F2 phsubd, 1
3668IEMIMPL_MEDIA_F2 phaddsw, 1
3669IEMIMPL_MEDIA_F2 phsubsw, 1
3670IEMIMPL_MEDIA_F2 pmaddubsw, 1
3671IEMIMPL_MEDIA_F2 pmulhrsw, 1
3672IEMIMPL_MEDIA_F2 pmuludq, 1
3673
3674
3675;;
3676; Media instruction working on two full sized registers, but no FXSAVE state argument.
3677;
3678; @param 1 The instruction
3679; @param 2 Whether there is an MMX variant (1) or not (0).
3680;
3681; @param A0 Pointer to the first media register size operand (input/output).
3682; @param A1 Pointer to the second media register size operand (input).
3683;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
        ; 64-bit MMX flavour: the qword at A0 is combined with the one at A1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; second operand (input only)
        movq    mm0, [A0]               ; first operand (input and output)
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

        ; 128-bit SSE flavour; unaligned moves, so no alignment is required.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; second operand (input only)
        movdqu  xmm0, [A0]              ; first operand (input and output)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3713
3714IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3715IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3716IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3717IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3718IEMIMPL_MEDIA_OPT_F2 psllw, 1
3719IEMIMPL_MEDIA_OPT_F2 pslld, 1
3720IEMIMPL_MEDIA_OPT_F2 psllq, 1
3721IEMIMPL_MEDIA_OPT_F2 psrlw, 1
3722IEMIMPL_MEDIA_OPT_F2 psrld, 1
3723IEMIMPL_MEDIA_OPT_F2 psrlq, 1
3724IEMIMPL_MEDIA_OPT_F2 psraw, 1
3725IEMIMPL_MEDIA_OPT_F2 psrad, 1
3726IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
3727IEMIMPL_MEDIA_OPT_F2 pavgb, 1
3728IEMIMPL_MEDIA_OPT_F2 pavgw, 1
3729IEMIMPL_MEDIA_OPT_F2 psadbw, 1
3730IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
3731IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
3732IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
3733IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
3734IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3735IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3736IEMIMPL_MEDIA_OPT_F2 aesimc, 0
3737IEMIMPL_MEDIA_OPT_F2 aesenc, 0
3738IEMIMPL_MEDIA_OPT_F2 aesdec, 0
3739IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
3740IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3741
3742;;
3743; Media instruction working on one full sized and one half sized register (lower half).
3744;
3745; @param 1 The instruction
3746; @param 2 1 if MMX is included, 0 if not.
3747;
3748; @param A0 Pointer to the first full sized media register operand (input/output).
3749; @param A1 Pointer to the second half sized media register operand (input).
3750;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
        ; 64-bit MMX flavour.  Both operands are loaded in full even though
        ; the instruction only consumes half of the second one.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; second operand (input only)
        movq    mm0, [A0]               ; first operand (input and output)
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

        ; 128-bit SSE flavour, same full-width loads as above.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; second operand (input only)
        movdqu  xmm0, [A0]              ; first operand (input and output)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3780
3781IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3782IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3783IEMIMPL_MEDIA_F1L1 punpckldq, 1
3784IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3785
3786
3787;;
; Media instruction working on two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
3790;
3791; @param 1 The instruction
3792;
3793; @param A0 Pointer to the destination register (full sized, output only).
3794; @param A1 Pointer to the first full sized media source register operand, where we
3795; will only use the lower half as input - but we'll be loading it in full.
3796; @param A2 Pointer to the second full sized media source register operand, where we
3797; will only use the lower half as input - but we'll be loading it in full.
3798;
%macro IEMIMPL_MEDIA_F1L1L1 1
        ; 128-bit flavour: *A0 = op(*A1, *A2), sources loaded in full.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source
        vmovdqu xmm1, [A2]              ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; destination is output only

        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

        ; 256-bit flavour, same layout with ymm registers.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; first source
        vmovdqu ymm1, [A2]              ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0              ; destination is output only

        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
3826
3827IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
3828IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
3829IEMIMPL_MEDIA_F1L1L1 vpunpckldq
3830IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3831
3832
3833;;
3834; Media instruction working on one full sized and one half sized register (high half).
3835;
3836; @param 1 The instruction
3837; @param 2 1 if MMX is included, 0 if not.
3838;
3839; @param A0 Pointer to the first full sized media register operand (input/output).
3840; @param A1 Pointer to the second full sized media register operand, where we
3841; will only use the upper half as input - but we'll load it in full.
3842;
; The code is the same as for the low-half variant since both operands are
; loaded in full; the separate name only documents which half the
; instruction consumes.
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high-half macro so it is not left unused.
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3851
3852
3853;;
3854; Media instruction working two half sized input registers (high half) and a full sized
3855; destination register (vpunpckh*).
3856;
3857; @param 1 The instruction
3858;
3859; @param A0 Pointer to the destination register (full sized, output only).
3860; @param A1 Pointer to the first full sized media source register operand, where we
3861; will only use the upper half as input - but we'll be loading it in full.
3862; @param A2 Pointer to the second full sized media source register operand, where we
3863; will only use the upper half as input - but we'll be loading it in full.
3864;
%macro IEMIMPL_MEDIA_F1H1H1 1
        ; Both sources are loaded in full, so the low-half template can be
        ; reused unchanged for the high-half instructions.
        IEMIMPL_MEDIA_F1L1L1 %1
%endmacro
3868
3869IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
3870IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
3871IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
3872IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3873
3874
3875;
3876; Shufflers with evil 8-bit immediates.
3877;
3878
;
; pshufw with a run-time immediate.
;
; The immediate cannot be encoded at run time, so one 'pshufw + ret' stub is
; emitted per possible value (256 in total) and the right one is called by
; computing its address from the immediate.
;
; A0 = pointer to the destination qword (output), A1 = pointer to the source
; qword, A2 = the 8-bit immediate.
;
; NOTE(review): the byte count given to BEGINPROC_FASTCALL is 16 although
; only three arguments are used - confirm against the C prototype.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; only the low 8 bits carry the immediate; the rest may be garbage
        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia!
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]           ; T1 = .imm0 + bImm * 5
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64
3904
3905
;
; SSE shuffles with a run-time immediate: one 'instruction + ret' stub per
; immediate value, selected by computing the stub address.
;
; A0 = pointer to the destination (output), A1 = pointer to the source,
; A2 = the 8-bit immediate.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; only the low 8 bits carry the immediate; the rest may be garbage
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + bImm * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3933
3934IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
3935IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
3936IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3937
3938
;
; AVX2 shuffles with a run-time immediate: one 'instruction + ret' stub per
; immediate value, selected by computing the stub address.
;
; A0 = pointer to the destination (output), A1 = pointer to the source,
; A2 = the 8-bit immediate.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX code, so use the AVX hooks

        movzx   A2, A2_8                ; only the low 8 bits carry the immediate; the rest may be garbage
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + bImm * 6
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
3966
3967IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
3968IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
3969IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3970
3971
3972;
3973; Shifts with evil 8-bit immediates.
3974;
3975
;
; MMX shifts by a run-time immediate: one 'instruction + ret' stub per
; immediate value, selected by computing the stub address.
;
; A0 = pointer to the qword operand (input and output), A1 = the 8-bit
; shift count.
;
; NOTE(review): the byte count given to BEGINPROC_FASTCALL is 16 although
; only two arguments are used - confirm against the C prototype.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; only the low 8 bits carry the count; the rest may be garbage
        movq    mm0, [A0]
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]           ; T1 = .imm0 + bImm * 5
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro
4002
4003IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4004IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4005IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4006IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4007IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4008IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4009IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4010IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4011
4012
;
; SSE shifts by a run-time immediate: one 'instruction + ret' stub per
; immediate value, selected by computing the stub address.
;
; A0 = pointer to the 128-bit operand (input and output), A1 = the 8-bit
; shift count.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; only the low 8 bits carry the count; the rest may be garbage
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: (A1 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + bImm * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro
4039
4040IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
4041IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
4042IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
4043IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
4044IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
4045IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
4046IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
4047IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
4048IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
4049IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4050
4051
4052;
4053; Move byte mask.
4054;
4055
; pmovmskb on an MMX operand: A0 = pointer to the 64-bit result,
; A1 = pointer to the 64-bit source.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq     mm1, [A1]
        pmovmskb T0, mm1                ; byte sign bits -> mask in T0
        mov      [A0], T0
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0      ; 32-bit build: clear the upper half of the 64-bit result separately
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4069
; pmovmskb on an SSE operand: A0 = pointer to the 64-bit result,
; A1 = pointer to the 128-bit source.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu   xmm1, [A1]
        pmovmskb T0, xmm1               ; byte sign bits -> mask in T0
        mov      [A0], T0
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0      ; 32-bit build: clear the upper half of the 64-bit result separately
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4083
; vpmovmskb on an AVX2 operand: A0 = pointer to the 64-bit result,
; A1 = pointer to the 256-bit source.
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu   ymm1, [A1]
        vpmovmskb T0, ymm1              ; byte sign bits -> mask in T0
        mov       [A0], T0
%ifdef RT_ARCH_X86
        mov       dword [A0 + 4], 0     ; 32-bit build: clear the upper half of the 64-bit result separately
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4097
4098
4099;;
4100; Media instruction working on two full sized source registers and one destination (AVX).
4101;
4102; @param 1 The instruction
4103;
4104; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4105; @param A1 Pointer to the destination media register size operand (output).
4106; @param A2 Pointer to the first source media register size operand (input).
4107; @param A3 Pointer to the second source media register size operand (input).
4108;
%macro IEMIMPL_MEDIA_F3 1
        ; 128-bit flavour: *A1 = op(*A2, *A3).  A0 is not referenced here.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]              ; first source
        vmovdqu xmm1, [A3]              ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0              ; destination is output only

        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

        ; 256-bit flavour, same layout with ymm registers.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]              ; first source
        vmovdqu ymm1, [A3]              ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0              ; destination is output only

        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4136
4137IEMIMPL_MEDIA_F3 vpshufb
4138IEMIMPL_MEDIA_F3 vpand
4139IEMIMPL_MEDIA_F3 vpminub
4140IEMIMPL_MEDIA_F3 vpminuw
4141IEMIMPL_MEDIA_F3 vpminud
4142IEMIMPL_MEDIA_F3 vpminsb
4143IEMIMPL_MEDIA_F3 vpminsw
4144IEMIMPL_MEDIA_F3 vpminsd
4145IEMIMPL_MEDIA_F3 vpmaxub
4146IEMIMPL_MEDIA_F3 vpmaxuw
4147IEMIMPL_MEDIA_F3 vpmaxud
4148IEMIMPL_MEDIA_F3 vpmaxsb
4149IEMIMPL_MEDIA_F3 vpmaxsw
4150IEMIMPL_MEDIA_F3 vpmaxsd
4151IEMIMPL_MEDIA_F3 vpandn
4152IEMIMPL_MEDIA_F3 vpor
4153IEMIMPL_MEDIA_F3 vpxor
4154IEMIMPL_MEDIA_F3 vpcmpeqb
4155IEMIMPL_MEDIA_F3 vpcmpeqw
4156IEMIMPL_MEDIA_F3 vpcmpeqd
4157IEMIMPL_MEDIA_F3 vpcmpeqq
4158IEMIMPL_MEDIA_F3 vpcmpgtb
4159IEMIMPL_MEDIA_F3 vpcmpgtw
4160IEMIMPL_MEDIA_F3 vpcmpgtd
4161IEMIMPL_MEDIA_F3 vpcmpgtq
4162IEMIMPL_MEDIA_F3 vpaddb
4163IEMIMPL_MEDIA_F3 vpaddw
4164IEMIMPL_MEDIA_F3 vpaddd
4165IEMIMPL_MEDIA_F3 vpaddq
4166IEMIMPL_MEDIA_F3 vpsubb
4167IEMIMPL_MEDIA_F3 vpsubw
4168IEMIMPL_MEDIA_F3 vpsubd
4169IEMIMPL_MEDIA_F3 vpsubq
4170
4171
4172;;
4173; Media instruction working on two full sized source registers and one destination (AVX),
4174; but no XSAVE state pointer argument.
4175;
4176; @param 1 The instruction
4177;
4178; @param A0 Pointer to the destination media register size operand (output).
4179; @param A1 Pointer to the first source media register size operand (input).
4180; @param A2 Pointer to the second source media register size operand (input).
4181;
%macro IEMIMPL_MEDIA_OPT_F3 1
        ; 128-bit flavour: *A0 = op(*A1, *A2).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source
        vmovdqu xmm1, [A2]              ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; destination is output only

        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

        ; 256-bit flavour, same layout with ymm registers.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; first source
        vmovdqu ymm1, [A2]              ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0              ; destination is output only

        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4209
4210IEMIMPL_MEDIA_OPT_F3 vpacksswb
4211IEMIMPL_MEDIA_OPT_F3 vpackssdw
4212IEMIMPL_MEDIA_OPT_F3 vpackuswb
4213IEMIMPL_MEDIA_OPT_F3 vpackusdw
4214IEMIMPL_MEDIA_OPT_F3 vpmullw
4215IEMIMPL_MEDIA_OPT_F3 vpmulld
4216IEMIMPL_MEDIA_OPT_F3 vpmulhw
4217IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4218IEMIMPL_MEDIA_OPT_F3 vpavgb
4219IEMIMPL_MEDIA_OPT_F3 vpavgw
4220IEMIMPL_MEDIA_OPT_F3 vpsignb
4221IEMIMPL_MEDIA_OPT_F3 vpsignw
4222IEMIMPL_MEDIA_OPT_F3 vpsignd
4223IEMIMPL_MEDIA_OPT_F3 vphaddw
4224IEMIMPL_MEDIA_OPT_F3 vphaddd
4225IEMIMPL_MEDIA_OPT_F3 vphsubw
4226IEMIMPL_MEDIA_OPT_F3 vphsubd
4227IEMIMPL_MEDIA_OPT_F3 vphaddsw
4228IEMIMPL_MEDIA_OPT_F3 vphsubsw
4229IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4230IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4231IEMIMPL_MEDIA_OPT_F3 vpsadbw
4232IEMIMPL_MEDIA_OPT_F3 vpmuldq
4233IEMIMPL_MEDIA_OPT_F3 vpmuludq
4234IEMIMPL_MEDIA_OPT_F3 vunpcklps
4235IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4236IEMIMPL_MEDIA_OPT_F3 vunpckhps
4237IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4238
4239;;
4240; Media instruction working on one full sized source registers and one destination (AVX),
4241; but no XSAVE state pointer argument.
4242;
4243; @param 1 The instruction
; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
4245;
4246; @param A0 Pointer to the destination media register size operand (output).
4247; @param A1 Pointer to the source media register size operand (input).
4248;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
        ; 128-bit flavour: *A0 = op(*A1).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; source
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0              ; destination is output only

        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
        ; 256-bit flavour, only emitted when the second macro parameter is 1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; source
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0              ; destination is output only

        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro
4276
4277IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4278IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4279IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4280IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4281
4282
4283;
4284; The SSE 4.2 crc32
4285;
; @param A0 Pointer to the 32-bit destination (the running CRC, input/output).
; @param A1 The source operand, sized according to the suffix.
4288;
; crc32 with an 8-bit source: the dword at A0 is updated with the low byte of A1.
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; running CRC
        crc32   T0_32, A1_8
        mov     dword [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4298
; crc32 with a 16-bit source: the dword at A0 is updated with the low word of A1.
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; running CRC
        crc32   T0_32, A1_16
        mov     dword [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4308
; crc32 with a 32-bit source: the dword at A0 is updated with the low dword of A1.
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; running CRC
        crc32   T0_32, A1_32
        mov     dword [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4318
%ifdef RT_ARCH_AMD64
; crc32 with a 64-bit source (only built for AMD64): the dword at A0 is
; updated with all of A1.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; running CRC; the 32-bit load also clears the upper half of T0
        crc32   T0, A1
        mov     dword [A0], T0_32       ; only the low 32 bits are written back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4330
4331
4332;
4333; PTEST (SSE 4.1)
4334;
4335; @param A0 Pointer to the first source operand (aka readonly destination).
4336; @param A1 Pointer to the second source operand.
4337; @param A2 Pointer to the EFLAGS register.
4338;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; second source
        movdqu  xmm0, [A0]              ; first source (read only)
        ptest   xmm0, xmm1
        ; Nothing may touch the flags between the test and saving them.
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4351
; 256-bit vptest: A0 and A1 point to the two sources, A2 to the EFLAGS
; value that receives the result.
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX code, so use the AVX hooks

        vmovdqu ymm0, [A0]              ; first source (read only)
        vmovdqu ymm1, [A1]              ; second source
        vptest  ymm0, ymm1
        ; Nothing may touch the flags between the test and saving them.
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4364
4365
4366;;
4367; Template for the [v]pmov{s,z}x* instructions
4368;
4369; @param 1 The instruction
4370;
4371; @param A0 Pointer to the destination media register size operand (output).
4372; @param A1 The source operand value (input).
4373;
%macro IEMIMPL_V_PMOV_SZ_X 1
        ; Legacy SSE flavour.  A1 carries the source value itself.  The store
        ; uses the legacy encoding so that this function contains no VEX
        ; encoded instruction.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd     xmm0, A1
        %1       xmm0, xmm0
        movdqu   [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

        ; 128-bit AVX flavour.  A1 carries the source value itself.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd     xmm0, A1
        v %+ %1  xmm0, xmm0
        vmovdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

        ; 256-bit AVX flavour.  Here A1 is a pointer to a 128-bit source,
        ; unlike the two functions above.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu   xmm0, [A1]
        v %+ %1  ymm0, xmm0
        vmovdqu  [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro
4411
4412IEMIMPL_V_PMOV_SZ_X pmovsxbw
4413IEMIMPL_V_PMOV_SZ_X pmovsxbd
4414IEMIMPL_V_PMOV_SZ_X pmovsxbq
4415IEMIMPL_V_PMOV_SZ_X pmovsxwd
4416IEMIMPL_V_PMOV_SZ_X pmovsxwq
4417IEMIMPL_V_PMOV_SZ_X pmovsxdq
4418
4419IEMIMPL_V_PMOV_SZ_X pmovzxbw
4420IEMIMPL_V_PMOV_SZ_X pmovzxbd
4421IEMIMPL_V_PMOV_SZ_X pmovzxbq
4422IEMIMPL_V_PMOV_SZ_X pmovzxwd
4423IEMIMPL_V_PMOV_SZ_X pmovzxwq
4424IEMIMPL_V_PMOV_SZ_X pmovzxdq
4425
4426
4427;;
4428; Need to move this as well somewhere better?
4429;
; Result of an SSE helper: a 128-bit value followed by an MXCSR value.
struc IEMSSERESULT
    .uResult:   resd 4                  ; 4 dwords = 128 bits
    .MXCSR:     resd 1
endstruc
4434
4435
4436;;
4437; Need to move this as well somewhere better?
4438;
; Result of a 128-bit AVX helper: a 128-bit value followed by an MXCSR value.
struc IEMAVX128RESULT
    .uResult:   resd 4                  ; 4 dwords = 128 bits
    .MXCSR:     resd 1
endstruc
4443
4444
4445;;
4446; Need to move this as well somewhere better?
4447;
; Result of a 256-bit AVX helper: a 256-bit value followed by an MXCSR value.
struc IEMAVX256RESULT
    .uResult:   resd 8                  ; 8 dwords = 256 bits
    .MXCSR:     resd 1
endstruc
4452
4453
4454;;
4455; Initialize the SSE MXCSR register using the guest value partially to
4456; account for rounding mode.
4457;
4458; @uses 4 bytes of stack to save the original value, T0.
4459; @param 1 Expression giving the address of the FXSTATE of the guest.
4460;
%macro SSE_LD_FXSTATE_MXCSR 1
        ; Park the current MXCSR in a 4-byte stack slot.  The slot stays
        ; allocated when this macro ends; SSE_ST_FXSTATE_MXCSR reloads the
        ; value from it and releases it.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Working MXCSR: the FZ, RC and DAZ bits of the value in the state
        ; given by the macro argument, plus all exception mask bits.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; ldmxcsr only takes a memory operand, so go via a temporary slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4473
4474
4475;;
4476; Restores the SSE MXCSR register with the original value.
4477;
4478; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4479; @param 1 Expression giving the address where to return the MXCSR value.
4480; @param 2 Expression giving the address of the FXSTATE of the guest.
4481;
4482; @note Restores the stack pointer.
4483;
%macro SSE_ST_FXSTATE_MXCSR 2
        ; Read the MXCSR left behind by the instruction via a temporary slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Returned MXCSR = MXCSR from the state given as second argument,
        ; with the exception flags raised just now OR'ed in.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ; Reload the MXCSR parked by SSE_LD_FXSTATE_MXCSR and release its slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4498
4499
4500;;
4501; Initialize the SSE MXCSR register using the guest value partially to
4502; account for rounding mode.
4503;
4504; @uses 4 bytes of stack to save the original value.
4505; @param 1 Expression giving the address of the FXSTATE of the guest.
4506;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        ; Park the current MXCSR in a 4-byte stack slot.  The slot stays
        ; allocated when this macro ends; the AVX*_ST_XSAVEAREA_MXCSR macros
        ; reload the value from it and release it.  Clobbers T0.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Working MXCSR: only the FZ, RC and DAZ bits of the value in the
        ; state given by the macro argument.
        ; NOTE(review): unlike SSE_LD_FXSTATE_MXCSR this does not set
        ; X86_MXCSR_XCPT_MASK, so the exception mask bits end up clear -
        ; confirm that this is intended.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ

        ; ldmxcsr only takes a memory operand, so go via a temporary slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4518
4519
4520;;
4521; Restores the AVX128 MXCSR register with the original value.
4522;
4523; @param 1 Expression giving the address where to return the MXCSR value.
4524;
4525; @note Restores the stack pointer.
4526;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        ; Hand the MXCSR left behind by the instruction back unmodified.
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR]

        ; Reload the MXCSR parked by AVX_LD_XSAVEAREA_MXCSR and release its slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4533
4534
4535;;
4536; Restores the AVX256 MXCSR register with the original value.
4537;
4538; @param 1 Expression giving the address where to return the MXCSR value.
4539;
4540; @note Restores the stack pointer.
4541;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        ; Hand the MXCSR left behind by the instruction back unmodified.
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR]

        ; Reload the MXCSR parked by AVX_LD_XSAVEAREA_MXCSR and release its slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4548
4549
4550;;
4551; Floating point instruction working on two full sized registers.
4552;
4553; @param 1 The instruction
4554; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4555;
4556; @param A0 FPU context (FXSTATE or XSAVEAREA).
4557; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input; the result is written to A1).
4559; @param A3 Pointer to the second media register size operand (input).
4560;
%macro IEMIMPL_FP_F2 2
        ; NOTE(review): every procedure below passes 12 as the byte count to
        ; BEGINPROC_FASTCALL although four arguments are used - confirm
        ; against the C prototypes.

        ; Legacy SSE flavour: the result and the merged MXCSR go to the
        ; IEMSSERESULT structure at A1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; leaves the previous MXCSR on the stack

        movdqu   xmm0, [A2]
        movdqu   xmm1, [A3]
        %1       xmm0, xmm1
        movdqu   [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; stores MXCSR, pops the previous one
        IEMIMPL_SSE_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
        ; AVX flavours of instructions taking three operands.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0       ; leaves the previous MXCSR on the stack

        vmovdqu  xmm0, [A2]
        vmovdqu  xmm1, [A3]
        v %+ %1  xmm0, xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1    ; stores MXCSR, pops the previous one
        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0       ; leaves the previous MXCSR on the stack

        vmovdqu  ymm0, [A2]
        vmovdqu  ymm1, [A3]
        v %+ %1  ymm0, ymm0, ymm1
        vmovdqu  [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1    ; stores MXCSR, pops the previous one
        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
        ; AVX flavours of instructions taking two operands.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0       ; leaves the previous MXCSR on the stack

        vmovdqu  xmm0, [A2]
        vmovdqu  xmm1, [A3]
        v %+ %1  xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1    ; stores MXCSR, pops the previous one
        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0       ; leaves the previous MXCSR on the stack

        vmovdqu  ymm0, [A2]
        vmovdqu  ymm1, [A3]
        v %+ %1  ymm0, ymm1
        vmovdqu  [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1    ; stores MXCSR, pops the previous one
        IEMIMPL_AVX_EPILOGUE            ; pair the prologue hook with the epilogue hook
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro
4639
4640IEMIMPL_FP_F2 addps, 3
4641IEMIMPL_FP_F2 addpd, 3
4642IEMIMPL_FP_F2 mulps, 3
4643IEMIMPL_FP_F2 mulpd, 3
4644IEMIMPL_FP_F2 subps, 3
4645IEMIMPL_FP_F2 subpd, 3
4646IEMIMPL_FP_F2 minps, 3
4647IEMIMPL_FP_F2 minpd, 3
4648IEMIMPL_FP_F2 divps, 3
4649IEMIMPL_FP_F2 divpd, 3
4650IEMIMPL_FP_F2 maxps, 3
4651IEMIMPL_FP_F2 maxpd, 3
4652IEMIMPL_FP_F2 haddps, 3
4653IEMIMPL_FP_F2 haddpd, 3
4654IEMIMPL_FP_F2 hsubps, 3
4655IEMIMPL_FP_F2 hsubpd, 3
4656IEMIMPL_FP_F2 addsubps, 3
4657IEMIMPL_FP_F2 addsubpd, 3
4658
4659
;;
; These are really unary operations, but to keep things simple they are
; treated as binary for now, so the output result always stays in sync
; with the register the result might get written to.
4665IEMIMPL_FP_F2 sqrtps, 2
4666IEMIMPL_FP_F2 sqrtpd, 2
4667IEMIMPL_FP_F2 cvtdq2ps, 2
4668IEMIMPL_FP_F2 cvtps2dq, 2
4669IEMIMPL_FP_F2 cvttps2dq, 2
4670IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4671IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
4672IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4673
4674
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; Emits two workers per instruction: the legacy SSE encoded one
; (iemAImpl_<insn>_u128_r32) and the VEX encoded one (iemAImpl_v<insn>_u128_r32).
;
; @param    1       The instruction
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]              ; Only the 32-bit scalar is fetched from the second operand.
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; Pairs with IEMIMPL_SSE_PROLOGUE above (the prologue macro used to be repeated here).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]              ; Only the 32-bit scalar is fetched from the second operand.
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Pairs with IEMIMPL_AVX_PROLOGUE above (the prologue macro used to be repeated here).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
4725
4726
;;
; Floating point instruction working on a full sized register and a double precision operand.
;
; Emits two workers per instruction: the legacy SSE encoded one
; (iemAImpl_<insn>_u128_r64) and the VEX encoded one (iemAImpl_v<insn>_u128_r64).
;
; @param    1       The instruction
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movq    xmm1, [A3]              ; Only the 64-bit scalar is fetched from the second operand.
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; Pairs with IEMIMPL_SSE_PROLOGUE above (the prologue macro used to be repeated here).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovq   xmm1, [A3]              ; Only the 64-bit scalar is fetched from the second operand.
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Pairs with IEMIMPL_AVX_PROLOGUE above (the prologue macro used to be repeated here).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4777
4778
;;
; Macro for the cvtpd2ps/cvtps2pd instructions.
;
; Emits the SSE worker plus the 128-bit and 256-bit VEX encoded workers.
;
; @param    1       The instruction name.
; @param    2       Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; Pairs with IEMIMPL_SSE_PROLOGUE above (the prologue macro used to be repeated here).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Pairs with IEMIMPL_AVX_PROLOGUE above (the prologue macro used to be repeated here).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; Narrowing form: 256-bit source, 128-bit result.
 %else
        v %+ %1 ymm0, xmm1              ; Widening form: 128-bit source, 256-bit result.
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Pairs with IEMIMPL_AVX_PROLOGUE above (the prologue macro used to be repeated here).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
4843
4844
;;
; shufps instructions with 8-bit immediates.
;
; The immediate is only available at runtime, so the worker calls into a
; table of 256 stubs (one per immediate value), each 6 bytes in size.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address: .imm0 + A2 * 6, computed as (A2 * 3) * 2.
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table; int3 pads each entry to 6 bytes.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufps  xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufps_u128
4878
4879
;;
; shufpd instruction with 8-bit immediates.
;
; The immediate is only available at runtime, so the worker calls into a
; table of 256 stubs (one per immediate value), each 6 bytes in size.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address: .imm0 + A2 * 6, computed as (A2 * 3) * 2.
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufpd  xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufpd_u128
4912
4913
;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Each worker calls into a table of 256 stubs (one per immediate value),
; each 6 bytes in size.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; VEX encoded loads/stores so the worker does not mix legacy SSE
        ; and VEX instructions (same as the other AVX workers in this file).
        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
4980
4981
;;
; One of the [p]blendv{b,ps,pd} variants
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      Pointer to the second media sized value (input).
; @param    A2      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; Pairs with IEMIMPL_SSE_PROLOGUE above (the prologue macro used to be repeated here).
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5010
5011
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; Emits a 128-bit and a 256-bit worker for the given instruction.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (output).
; @param    A1      Pointer to the first media register sized operand (input).
; @param    A2      Pointer to the second media register sized operand (input).
; @param    A3      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Pairs with IEMIMPL_AVX_PROLOGUE above (the prologue macro used to be repeated here).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Pairs with IEMIMPL_AVX_PROLOGUE above (the prologue macro used to be repeated here).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5054
5055
;;
; palignr mm1, mm2/m64 instruction.
;
; Calls into a table of 256 stubs (one per immediate value), each 6 bytes
; in size.
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      The second register sized operand, passed by value (input).
; @param    A2      The 8-bit immediate.
;
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        ; Stub address: .imm0 + A2 * 6, computed as (A2 * 3) * 2.
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movq    mm1, A1
        movq    mm0, [A0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        palignr mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_palignr_u64
5087
5088
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; The worker calls into a table of 256 stubs (one per immediate value);
; each stub is the instruction followed by ret and int3, 8 bytes in total.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address: .imm0 + A2 * 8, computed as (A2 * 4) * 2.
        lea     T0, [A2 + A2*3]         ; sizeof(insnX+ret+int3) == 8
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5135
5136
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Each worker calls into a table of 256 stubs (one per immediate value);
; each stub is the instruction followed by ret and int3, 8 bytes in total.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; VEX encoded loads/stores so the worker does not mix legacy SSE
        ; and VEX instructions (same as the 256-bit variant below).
        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret+int3) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret+int3) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5213
5214
;;
; Source operand pair handed to the pcmpistri worker (two 16 byte values).
;
; Need to move this as well somewhere better?
;
struc IEMPCMPISTRISRC
    .uSrc1      resq 2                  ; First 128-bit source operand.
    .uSrc2      resq 2                  ; Second 128-bit source operand.
endstruc
5222
;;
; The pcmpistri instruction.
;
; Calls into a table of 256 stubs (one per immediate value), each 8 bytes
; in size, then stores the resulting flags and ECX.
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)

        ; Stub address: .imm0 + A3 * 8, computed as (A3 * 4) * 2.
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret+int3) == 8
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A2 + IEMPCMPISTRISRC.uSrc2]
        movdqu  xmm0, [A2 + IEMPCMPISTRISRC.uSrc1]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; The index result is written through the saved A0 pointer.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistri xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistri_u128
5260
5261
;;
; pinsrw instruction, MMX register variant.
;
; Calls into a table of 256 stubs (one per immediate value), each 5 bytes
; in size.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      The 16 bit input operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address: .imm0 + A2 * 5.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*4]         ; sizeof(pinsrw+ret) == 5
        lea     T1, [T1 + T0]

        movq    mm0, [A0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table; the stubs read the word to insert straight from A1.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  mm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u64
5293
;
; pinsrw instruction, XMM register variant.
;
; Same arguments as the MMX variant: A0 points to the media register
; (input/output), A1 holds the 16 bit value and A2 the 8-bit immediate.
; The stubs are 6 bytes in size here.
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address: .imm0 + A2 * 6, computed as (A2 * 3) * 2.
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table; the stubs read the word to insert straight from A1.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u128
5318
;;
; vpinsrw instruction.
;
; Calls into a table of 256 stubs (one per immediate value), each of which
; must be exactly 6 bytes in size.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 16 bit input operand (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        ; The stubs take the word from A1 rather than A2: A1 is free once the
        ; source has been loaded, and using it keeps every stub at 6 bytes.
        ; NOTE(review): assumes A2 maps to r8 with the Microsoft x64 calling
        ; convention (register macros are defined outside this section); r8d
        ; as source forces the 3-byte VEX prefix, giving 7 byte stubs that
        ; break the 6 byte stride and the size check below.
        mov     A1, A2
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpinsrw_u128
5351
5352
;;
; pextrw instruction, MMX register variant.
;
; Calls into a table of 256 stubs (one per immediate value), each 5 bytes
; in size; the stubs leave the extracted word in T0.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      The media register size operand, passed by value (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address: .imm0 + A2 * 5.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*4]         ; sizeof(pextrw+ret) == 5
        lea     T1, [T1 + T0]

        movq    mm0, A1
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u64
5384
;
; pextrw instruction, XMM register variant.
;
; A0 points to the 16 bit output, A1 points to the 128-bit source operand
; and A2 holds the 8-bit immediate.  The stubs are 6 bytes in size here and
; leave the extracted word in T0.
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address: .imm0 + A2 * 6, computed as (A2 * 3) * 2.
        lea     T0, [A2 + A2*2]         ; sizeof(pextrw+ret) == 6
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm0, [A1]
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u128
5409
;;
; vpextrw instruction.
;
; Calls into a table of 256 stubs (one per immediate value), each 6 bytes
; in size; the stubs leave the extracted word in T0.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address: .imm0 + A2 * 6, computed as (A2 * 3) * 2.
        lea     T0, [A2 + A2*2]         ; sizeof(vpextrw+ret) == 6
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm0, [A1]
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpextrw T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpextrw_u128
5441
5442
;;
; movmskp{s,d} SSE instruction template
;
; Emits the SSE worker plus the 128-bit and 256-bit AVX workers.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8         ; Only the low byte of the mask is stored.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; VEX encoded load, no legacy SSE / VEX mixing in the AVX worker.
        %2      T0, xmm0
        mov     byte [A0], T0_8         ; Only the low byte of the mask is stored.

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8         ; Only the low byte of the mask is stored.

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5492
5493
;;
; Returns the guest MXCSR merged with the freshly raised exception flags and
; reloads MXCSR from the dword found on top of the stack.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note     Restores the stack pointer: the 4 byte slot which is on top of the
;           stack when the macro is entered gets popped (presumably the
;           original MXCSR saved by SSE_LD_FXSTATE_MXCSR - TODO confirm).
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        ; Fetch the current MXCSR via a scratch stack slot, keeping only the exception flags.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, X86_MXCSR_XCPT_FLAGS
        and     T0_32, [xSP]
        add     xSP, 4

        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload MXCSR from the slot on top of the stack and drop the slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5517
5518
;;
; Template for the cvt[t]sd2si and cvt[t]ss2si workers (scalar floating point
; to integer conversion with the result going to memory via T0).
;
; Replaces eight hand copied procedures which only differed in the
; instruction and the operand size.
;
; @param    1       The instruction name.
; @param    2       Worker name suffix including the leading underscore (e.g. _i32_r64).
; @param    3       The temporary register variant receiving the result (T0_32 or T0).
; @param    4       The size keyword for storing the result (dword or qword).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
%macro IEMIMPL_SSE_CVT_FP2INT 4
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ %2, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        %1      %3, [A3]
        mov     %4 [A2], %3

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ %2
%endmacro

; cvttsd2si - 32-bit and 64-bit variants.
IEMIMPL_SSE_CVT_FP2INT cvttsd2si, _i32_r64, T0_32, dword
IEMIMPL_SSE_CVT_FP2INT cvttsd2si, _i64_r64, T0,    qword

; cvtsd2si - 32-bit and 64-bit variants.
IEMIMPL_SSE_CVT_FP2INT cvtsd2si,  _i32_r64, T0_32, dword
IEMIMPL_SSE_CVT_FP2INT cvtsd2si,  _i64_r64, T0,    qword

; cvttss2si - 32-bit and 64-bit variants.
IEMIMPL_SSE_CVT_FP2INT cvttss2si, _i32_r32, T0_32, dword
IEMIMPL_SSE_CVT_FP2INT cvttss2si, _i64_r32, T0,    qword

; cvtss2si - 32-bit and 64-bit variants.
IEMIMPL_SSE_CVT_FP2INT cvtss2si,  _i32_r32, T0_32, dword
IEMIMPL_SSE_CVT_FP2INT cvtss2si,  _i64_r32, T0,    qword


;;
; Template for the cvtsi2ss and cvtsi2sd workers (integer in memory to scalar
; floating point conversion with the result going to memory via xmm0).
;
; Replaces four hand copied procedures which only differed in the
; instruction and the operand sizes.
;
; @param    1       The instruction name.
; @param    2       Worker name suffix including the leading underscore (e.g. _r32_i32).
; @param    3       The size keyword of the integer source operand (dword or qword).
; @param    4       The instruction storing the result (movd for r32, movq for r64).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
%macro IEMIMPL_SSE_CVT_INT2FP 4
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ %2, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        %1      xmm0, %3 [A3]
        %4      [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ %2
%endmacro

; cvtsi2ss - 32-bit and 64-bit integer source variants.
IEMIMPL_SSE_CVT_INT2FP cvtsi2ss, _r32_i32, dword, movd
IEMIMPL_SSE_CVT_INT2FP cvtsi2ss, _r32_i64, qword, movd

; cvtsi2sd - 32-bit and 64-bit integer source variants.
IEMIMPL_SSE_CVT_INT2FP cvtsi2sd, _r64_i32, dword, movq
IEMIMPL_SSE_CVT_INT2FP cvtsi2sd, _r64_i64, qword, movq
5775
5776
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Only the FZ, RC and DAZ bits are taken from the guest value; the
; X86_MXCSR_XCPT_MASK bits are always set in the value that gets loaded.
;
; The previous MXCSR is saved in a 4 byte stack slot which is still allocated
; when the macro ends, i.e. xSP is 4 lower afterwards.  The slot is consumed
; again by SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        sub     xSP, 4                  ; Slot for the original MXCSR, stays allocated.

        ; Derive the value to load from the guest MXCSR.
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        stmxcsr [xSP]                   ; Save the original MXCSR in the slot.

        ; ldmxcsr only takes a memory operand, so go via a temporary stack slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5796
5797
;;
; Restores the SSE MXCSR register with the original value.
;
; The X86_MXCSR_XCPT_FLAGS bits of the current MXCSR are OR'ed into the value
; at the given address before the original MXCSR is reloaded from the stack
; slot left behind by SSE_LD_FXSTATE_MXCSR_ONLY.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note     Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        ; Read the current MXCSR via a temporary stack slot (stmxcsr is memory only).
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Merge the status bits into the original MXCSR value.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%1]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; xSP points at the slot with the saved MXCSR again; reload and release it.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5820
5821
;;
; UCOMISS (SSE)
;
; Compares the low single precision values of the two sources using the
; host's ucomiss and passes the resulting EFLAGS to IEM_SAVE_FLAGS.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        ucomiss xmm0, xmm1
        ; Nothing may touch EFLAGS between the compare and the flag saving.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
5844
;;
; VUCOMISS (AVX)
;
; VEX encoded variant of the UCOMISS helper.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vucomiss xmm0, xmm1
        ; Nothing may touch EFLAGS between the compare and the flag saving.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
5859
5860
;;
; UCOMISD (SSE)
;
; Compares the low double precision values of the two sources using the
; host's ucomisd and passes the resulting EFLAGS to IEM_SAVE_FLAGS.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        ucomisd xmm0, xmm1
        ; Nothing may touch EFLAGS between the compare and the flag saving.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
5883
;;
; VUCOMISD (AVX)
;
; VEX encoded variant of the UCOMISD helper.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vucomisd xmm0, xmm1
        ; Nothing may touch EFLAGS between the compare and the flag saving.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
5898
;;
; COMISS (SSE)
;
; Compares the low single precision values of the two sources using the
; host's comiss and passes the resulting EFLAGS to IEM_SAVE_FLAGS.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        comiss  xmm0, xmm1
        ; Nothing may touch EFLAGS between the compare and the flag saving.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
5921
;;
; VCOMISS (AVX)
;
; VEX encoded variant of the COMISS helper.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vcomiss xmm0, xmm1
        ; Nothing may touch EFLAGS between the compare and the flag saving.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
5936
5937
;;
; COMISD (SSE)
;
; Compares the low double precision values of the two sources using the
; host's comisd and passes the resulting EFLAGS to IEM_SAVE_FLAGS.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        comisd  xmm0, xmm1
        ; Nothing may touch EFLAGS between the compare and the flag saving.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
5960
;;
; VCOMISD (AVX)
;
; VEX encoded variant of the COMISD helper.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vcomisd xmm0, xmm1
        ; Nothing may touch EFLAGS between the compare and the flag saving.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
5975
5976
;;
; Pair of 16 byte source operands, passed by pointer to the two-input media
; helpers in this file (e.g. iemAImpl_cmpps_u128).
;
; Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1      resb 16                 ; First source operand (offset 0).
    .uSrc2      resb 16                 ; Second source operand (offset 16).
endstruc
5984
5985
;;
; CMPPS (SSE)
;
; The comparison predicate is an instruction immediate, so the function holds
; one 'cmpps xmm0, xmm1, imm8 / ret' stub per possible immediate value and
; calls the stub selected by A3.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        ; Only the low 8 bits of A3 carry the immediate.  Clear the rest so that
        ; stale upper register bits cannot push the computed call target outside
        ; of the 256 entry stub table below.
        and     A3, 0xff

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T0, [A3 + A3*4]         ; sizeof(cmpps+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]           ; T1 = &.imm<A3>
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; One 5 byte stub per immediate value, indexed by the code above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        cmpps   xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_cmpps_u128
6021
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; The immediate is part of the instruction encoding, so each generated function
; holds one 'xxx xmm0, xmm1, imm8 / ret' stub per possible immediate value and
; calls the stub selected by A3.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        ; Only the low 8 bits of A3 carry the immediate.  Clear the rest so that
        ; stale upper register bits cannot push the computed call target outside
        ; of the 256 entry stub table below.
        and     A3, 0xff

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(insn+ret) == 6: (A3 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = &.imm<A3>
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; One 6 byte stub per immediate value, indexed by the code above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6068
;;
; SSE instructions of the form
;    xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; The 16 byte source is read from A2 and the 8 byte result of the instruction
; is written to A1.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]              ; unaligned load of the source
        %1      mm0, xmm0
        movq    [A1], mm0               ; hand back the 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6098
;;
; SSE instructions of the form
;    xxx xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; The destination at A1 is loaded before the instruction is executed and the
; full 16 bytes are written back afterwards.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    mm0, A2                 ; the source is passed by value
        movdqu  xmm0, [A1]              ; current destination content
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6129
;;
; SSE instructions of the form
;    xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; The 64-bit source arrives by value in A2 and is moved into xmm0 first; the
; 8 byte result of the instruction is written to A1.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    xmm0, A2                ; source value into the low qword of xmm0
        %1      mm0, xmm0
        movq    [A1], mm0               ; hand back the 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette