VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 99813

Last change on this file since 99813 was 99790, checked in by vboxsync, 19 months ago

IEM: Fixed iemAImpl_rorx_u64 copy/paste error.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 184.9 KB
Line 
1; $Id: IEMAllAImpl.asm 99790 2023-05-15 13:10:32Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; RET XX / RET wrapper for fastcall.
43;
; Emits 'ret %1' only for 32-bit Windows, where the fastcall convention is
; callee-cleans-stack; on other 32-bit hosts and on AMD64 a plain 'ret' is
; emitted and %1 is ignored (caller cleans / register-based args).
;
; @param 1    The number of argument bytes to pop on return (x86 Windows only).
44%macro RET_FASTCALL 1
45%ifdef RT_ARCH_X86
46 %ifdef RT_OS_WINDOWS
47 ret %1
48 %else
49 ret
50 %endif
51%else
52 ret
53%endif
54%endmacro
55
56;;
57; NAME for fastcall functions.
58;
59;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
60; escaping (or whatever the dollar is good for here). Thus the ugly
61; prefix argument.
62;
; Expands to the platform symbol name: on 32-bit Windows the fastcall
; decoration 'PrefixName@cbArgs' is produced (prefix supplied by the caller,
; '@' or '$@', due to the yasm escaping issue above); everywhere else it is
; simply NAME(a_Name) and the other two arguments are ignored.
;
; @param a_Name    The C function name.
; @param a_cbArgs  The argument byte count (used in the x86 Windows decoration).
; @param a_Prefix  The decoration prefix to emit ('@' or '$@').
63%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
64%ifdef RT_ARCH_X86
65 %ifdef RT_OS_WINDOWS
66 %undef NAME_FASTCALL
67 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
68 %endif
69%endif
70
71;;
72; BEGINPROC for fastcall functions.
73;
74; @param 1 The function name (C).
75; @param 2 The argument size on x86.
76;
; Exports the decorated name on PE and (NASM-only) OMF targets, declares it
; global for everything except flat binary output, then emits the label.
; Note: the export/global lines use the '$@' prefix while the label itself
; uses plain '@' — this is the yasm dollar-escaping workaround.
77%macro BEGINPROC_FASTCALL 2
78 %ifdef ASM_FORMAT_PE
79 export %1=NAME_FASTCALL(%1,%2,$@)
80 %endif
81 %ifdef __NASM__
82 %ifdef ASM_FORMAT_OMF
83 export NAME(%1) NAME_FASTCALL(%1,%2,$@)
84 %endif
85 %endif
86 %ifndef ASM_FORMAT_BIN
87 global NAME_FASTCALL(%1,%2,$@)
88 %endif
89NAME_FASTCALL(%1,%2,@):
90%endmacro
91
92
93;
94; We employ some macro assembly here to hide the calling convention differences.
95;
96%ifdef RT_ARCH_AMD64
 ; AMD64: all four arguments arrive in registers (GCC or MSC convention,
 ; selected below), so the PROLOGUE_* macros are no-ops and the EPILOGUE_*
 ; macros are a plain 'ret'; the *_EX byte-count argument is ignored.
 ; NOTE(review): EPILOGUE_1_ARGS_EX is declared with 0 parameters here but
 ; with 1 parameter in the x86 branch — an invocation passing a byte count
 ; would only assemble on x86; confirm no AMD64 caller passes one.
 97 %macro PROLOGUE_1_ARGS 0
 98 %endmacro
 99 %macro EPILOGUE_1_ARGS 0
 100 ret
 101 %endmacro
 102 %macro EPILOGUE_1_ARGS_EX 0
 103 ret
 104 %endmacro
 105
 106 %macro PROLOGUE_2_ARGS 0
 107 %endmacro
 108 %macro EPILOGUE_2_ARGS 0
 109 ret
 110 %endmacro
 111 %macro EPILOGUE_2_ARGS_EX 1
 112 ret
 113 %endmacro
 114
 115 %macro PROLOGUE_3_ARGS 0
 116 %endmacro
 117 %macro EPILOGUE_3_ARGS 0
 118 ret
 119 %endmacro
 120 %macro EPILOGUE_3_ARGS_EX 1
 121 ret
 122 %endmacro
 123
 124 %macro PROLOGUE_4_ARGS 0
 125 %endmacro
 126 %macro EPILOGUE_4_ARGS 0
 127 ret
 128 %endmacro
 129 %macro EPILOGUE_4_ARGS_EX 1
 130 ret
 131 %endmacro
 132
 ; Argument registers for the System V AMD64 convention (rdi/rsi/rdx/rcx).
 133 %ifdef ASM_CALL64_GCC
 134 %define A0 rdi
 135 %define A0_32 edi
 136 %define A0_16 di
 137 %define A0_8 dil
 138
 139 %define A1 rsi
 140 %define A1_32 esi
 141 %define A1_16 si
 142 %define A1_8 sil
 143
 144 %define A2 rdx
 145 %define A2_32 edx
 146 %define A2_16 dx
 147 %define A2_8 dl
 148
 149 %define A3 rcx
 150 %define A3_32 ecx
 151 %define A3_16 cx
 152 %endif
 153
 ; Argument registers for the Microsoft x64 convention (rcx/rdx/r8/r9).
 154 %ifdef ASM_CALL64_MSC
 155 %define A0 rcx
 156 %define A0_32 ecx
 157 %define A0_16 cx
 158 %define A0_8 cl
 159
 160 %define A1 rdx
 161 %define A1_32 edx
 162 %define A1_16 dx
 163 %define A1_8 dl
 164
 165 %define A2 r8
 166 %define A2_32 r8d
 167 %define A2_16 r8w
 168 %define A2_8 r8b
 169
 170 %define A3 r9
 171 %define A3_32 r9d
 172 %define A3_16 r9w
 173 %endif
 174
 ; Temporary/scratch registers — all volatile in both 64-bit conventions,
 ; so no saving is required in the prologues above.
 175 %define T0 rax
 176 %define T0_32 eax
 177 %define T0_16 ax
 178 %define T0_8 al
 179
 180 %define T1 r11
 181 %define T1_32 r11d
 182 %define T1_16 r11w
 183 %define T1_8 r11b
 184
 185 %define T2 r10 ; only AMD64
 186 %define T2_32 r10d
 187 %define T2_16 r10w
 188 %define T2_8 r10b
 189
190%else
 191 ; x86
 ; 32-bit x86 fastcall: A0/A1 arrive in ecx/edx, the remaining arguments on
 ; the stack (callee cleans). The prologues save the callee-saved registers
 ; used as A2/A3/T1 (ebx, esi, edi) and load the stack-based arguments; the
 ; epilogues restore them and 'ret %1' the argument bytes.
 192 %macro PROLOGUE_1_ARGS 0
 193 push edi
 194 %endmacro
 195 %macro EPILOGUE_1_ARGS 0
 196 pop edi
 197 ret 0
 198 %endmacro
 199 %macro EPILOGUE_1_ARGS_EX 1
 200 pop edi
 201 ret %1
 202 %endmacro
 203
 204 %macro PROLOGUE_2_ARGS 0
 205 push edi
 206 %endmacro
 207 %macro EPILOGUE_2_ARGS 0
 208 pop edi
 209 ret 0
 210 %endmacro
 211 %macro EPILOGUE_2_ARGS_EX 1
 212 pop edi
 213 ret %1
 214 %endmacro
 215
 216 %macro PROLOGUE_3_ARGS 0
 217 push ebx
 218 mov ebx, [esp + 4 + 4]
 219 push edi
 220 %endmacro
 221 %macro EPILOGUE_3_ARGS_EX 1
 222 %if (%1) < 4
 223 %error "With three args, at least 4 bytes must be removed from the stack upon return (32-bit)."
 224 %endif
 225 pop edi
 226 pop ebx
 227 ret %1
 228 %endmacro
 229 %macro EPILOGUE_3_ARGS 0
 230 EPILOGUE_3_ARGS_EX 4
 231 %endmacro
 232
 233 %macro PROLOGUE_4_ARGS 0
 234 push ebx
 235 push edi
 236 push esi
 237 mov ebx, [esp + 12 + 4 + 0]
 238 mov esi, [esp + 12 + 4 + 4]
 239 %endmacro
 240 %macro EPILOGUE_4_ARGS_EX 1
 241 %if (%1) < 8
 242 %error "With four args, at least 8 bytes must be removed from the stack upon return (32-bit)."
 243 %endif
 244 pop esi
 245 pop edi
 246 pop ebx
 247 ret %1
 248 %endmacro
 249 %macro EPILOGUE_4_ARGS 0
 250 EPILOGUE_4_ARGS_EX 8
 251 %endmacro
 252
 ; Argument mapping: A0/A1 in ecx/edx (fastcall), A2/A3 loaded from the
 ; stack into the callee-saved ebx/esi by the prologues above.
 253 %define A0 ecx
 254 %define A0_32 ecx
 255 %define A0_16 cx
 256 %define A0_8 cl
 257
 258 %define A1 edx
 259 %define A1_32 edx
 260 %define A1_16 dx
 261 %define A1_8 dl
 262
 263 %define A2 ebx
 264 %define A2_32 ebx
 265 %define A2_16 bx
 266 %define A2_8 bl
 267
 268 %define A3 esi
 269 %define A3_32 esi
 270 %define A3_16 si
 271
 ; Temporaries: T0 is the volatile eax; T1 is edi, callee-saved and hence
 ; pushed/popped by every prologue/epilogue pair above. No T2 on x86.
 272 %define T0 eax
 273 %define T0_32 eax
 274 %define T0_16 ax
 275 %define T0_8 al
 276
 277 %define T1 edi
 278 %define T1_32 edi
 279 %define T1_16 di
 280%endif
281
282
283;;
284; Load the relevant flags from [%1] if there are undefined flags (%3).
285;
286; @remarks Clobbers T0, stack. Changes EFLAGS.
287; (%1 is one of the argument registers A0..A3 holding the eflags pointer.)
288; @param 1 The parameter (A0..A3) pointing to the eflags.
289; @param 2 The set of modified flags.
290; @param 3 The set of undefined flags.
291;
292%macro IEM_MAYBE_LOAD_FLAGS 3
 ; NOTE(review): the '%if (%3) != 0' guard is commented out, so the guest
 ; flags are currently merged into the host EFLAGS unconditionally, making
 ; this expansion identical to IEM_LOAD_FLAGS — confirm that is intended.
293 ;%if (%3) != 0
294 pushf ; store current flags
295 mov T0_32, [%1] ; load the guest flags
296 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
297 and T0_32, (%2 | %3) ; select the modified and undefined flags.
298 or [xSP], T0 ; merge guest flags with host flags.
299 popf ; load the mixed flags.
300 ;%endif
301%endmacro
302
303;;
304; Load the relevant flags from [%1].
305;
; Unconditionally merges the guest flags selected by (%2 | %3) into the
; host EFLAGS via a pushf / masked-or / popf sequence on the stack.
306; @remarks Clobbers T0, stack. Changes EFLAGS.
307; (%1 is one of the argument registers A0..A3 holding the eflags pointer.)
308; @param 1 The parameter (A0..A3) pointing to the eflags.
309; @param 2 The set of flags to load.
310; @param 3 The set of undefined flags.
311;
312%macro IEM_LOAD_FLAGS 3
313 pushf ; store current flags
314 mov T0_32, [%1] ; load the guest flags
315 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
316 and T0_32, (%2 | %3) ; select the modified and undefined flags.
317 or [xSP], T0 ; merge guest flags with host flags.
318 popf ; load the mixed flags.
319%endmacro
320
321;;
322; Update the guest EFLAGS at [%1] with the host EFLAGS for the modified and
; undefined flag masks; all other guest flag bits are preserved. Expands to
; nothing when both masks are zero.
323;
324; @remarks Clobbers T0, T1, stack. Changes host EFLAGS (and/or below).
325; @param 1 The register pointing to the EFLAGS.
326; @param 2 The mask of modified flags to save.
327; @param 3 The mask of undefined flags to (maybe) save.
328;
329%macro IEM_SAVE_FLAGS 3
330 %if (%2 | %3) != 0
331 pushf
332 pop T1
333 mov T0_32, [%1] ; flags
334 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
335 and T1_32, (%2 | %3) ; select the modified and undefined flags.
336 or T0_32, T1_32 ; combine the flags.
337 mov [%1], T0_32 ; save the flags.
338 %endif
339%endmacro
340
341;;
342; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
343;
; Like IEM_SAVE_FLAGS, but additionally always clears the %3 bits and always
; sets the %4 bits in the stored result. Expands to nothing if all masks are 0.
344; @remarks Clobbers T0, T1, stack. Changes host EFLAGS (and/or below).
345; @param 1 The register pointing to the EFLAGS.
346; @param 2 The mask of modified flags to save.
347; @param 3 Mask of additional flags to always clear
348; @param 4 Mask of additional flags to always set.
349;
350%macro IEM_SAVE_AND_ADJUST_FLAGS 4
351 %if (%2 | %3 | %4) != 0
352 pushf
353 pop T1
354 mov T0_32, [%1] ; load flags.
355 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
356 and T1_32, (%2) ; select the modified flags.
357 or T0_32, T1_32 ; combine the flags.
358 %if (%4) != 0
359 or T0_32, %4 ; add the always set flags.
360 %endif
361 mov [%1], T0_32 ; save the result.
362 %endif
363%endmacro
364
365;;
366; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
367; signed input (%4[%5]) and parity index (%6).
368;
369; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
370; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
371; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
372;
373; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
; (On AMD64, T2 is clobbered as well — it receives the pushf'd
; flags and later the parity table address.)
374; @param 1 The register pointing to the EFLAGS.
375; @param 2 The mask of modified flags to save.
376; @param 3 Mask of additional flags to always clear
377; @param 4 The result register to set SF by.
378; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
379; @param 6 The (full) register containing the parity table index. Will be modified!
380
381%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 ; Capture the host flags: into T2 on AMD64; on x86 there is no T2, so T0
 ; is spilled to the stack first and temporarily reused for the flags.
382 %ifdef RT_ARCH_AMD64
383 pushf
384 pop T2
385 %else
386 push T0
387 pushf
388 pop T0
389 %endif
390 mov T1_32, [%1] ; load flags.
391 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
392 %ifdef RT_ARCH_AMD64
393 and T2_32, (%2) ; select the modified flags.
394 or T1_32, T2_32 ; combine the flags.
395 %else
396 and T0_32, (%2) ; select the modified flags.
397 or T1_32, T0_32 ; combine the flags.
398 pop T0
399 %endif
400
401 ; First calculate SF as it's likely to be referring to the same register as %6 does.
402 bt %4, %5 - 1
403 jnc %%sf_clear
404 or T1_32, X86_EFL_SF
405 %%sf_clear:
406
407 ; Parity last.
408 and %6, 0xff
409 %ifdef RT_ARCH_AMD64
410 lea T2, [NAME(g_afParity) xWrtRIP]
411 or T1_8, [T2 + %6]
412 %else
413 or T1_8, [NAME(g_afParity) + %6]
414 %endif
415
416 mov [%1], T1_32 ; save the result.
417%endmacro
418
419;;
420; Calculates the new EFLAGS using fixed clear and set bit masks.
421;
; Unlike IEM_SAVE_FLAGS this never reads the host EFLAGS (no pushf); it only
; rewrites the guest EFLAGS dword at [%1]. Expands to nothing if both masks
; are zero.
422; @remarks Clobbers T0.
423; @param 1 The register pointing to the EFLAGS.
424; @param 2 Mask of additional flags to always clear
425; @param 3 Mask of additional flags to always set.
426;
427%macro IEM_ADJUST_FLAGS 3
428 %if (%2 | %3) != 0
429 mov T0_32, [%1] ; Load flags.
430 %if (%2) != 0
431 and T0_32, ~(%2) ; Remove the always cleared flags.
432 %endif
433 %if (%3) != 0
434 or T0_32, %3 ; Add the always set flags.
435 %endif
436 mov [%1], T0_32 ; Save the result.
437 %endif
438%endmacro
439
440;;
441; Calculates the new EFLAGS using fixed clear and set bit masks.
442;
; Same as IEM_ADJUST_FLAGS, but additionally computes PF from the low byte
; of %4 via the g_afParity lookup table.
443; @remarks Clobbers T0, %4, EFLAGS.
; NOTE(review): on AMD64, T2 is also clobbered (it holds the parity
; table address below) — not reflected in the remark above.
444; @param 1 The register pointing to the EFLAGS.
445; @param 2 Mask of additional flags to always clear
446; @param 3 Mask of additional flags to always set.
447; @param 4 The (full) register containing the parity table index. Will be modified!
448;
449%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
450 mov T0_32, [%1] ; Load flags.
451 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
452 %if (%3) != 0
453 or T0_32, %3 ; Add the always set flags.
454 %endif
455 and %4, 0xff
456 %ifdef RT_ARCH_AMD64
457 lea T2, [NAME(g_afParity) xWrtRIP]
458 or T0_8, [T2 + %4]
459 %else
460 or T0_8, [NAME(g_afParity) + %4]
461 %endif
462 mov [%1], T0_32 ; Save the result.
463%endmacro
464
465
466;*********************************************************************************************************************************
467;* External Symbols *
468;*********************************************************************************************************************************
469extern NAME(g_afParity)
470
471
472;;
473; Macro for implementing a binary operator.
474;
475; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
476; variants, except on 32-bit system where the 64-bit accesses requires hand
477; coding.
478;
479; All the functions takes a pointer to the destination memory operand in A0,
480; the source register operand in A1 and a pointer to eflags in A2.
481;
482; @param 1 The instruction mnemonic.
483; @param 2 Non-zero if there should be a locked version.
484; @param 3 The modified flags.
485; @param 4 The undefined flags.
486;
; Each generated function follows the same shape: load guest flags, perform
; '%1' directly on the memory destination, then write back the modified /
; undefined flags. The _u64 variants declare 16 argument bytes (the extra 8
; are popped via EPILOGUE_3_ARGS_EX 8), the narrower variants 12.
487%macro IEMIMPL_BIN_OP 4
488BEGINCODE
489BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
490 PROLOGUE_3_ARGS
491 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
492 %1 byte [A0], A1_8
493 IEM_SAVE_FLAGS A2, %3, %4
494 EPILOGUE_3_ARGS
495ENDPROC iemAImpl_ %+ %1 %+ _u8
496
497BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
498 PROLOGUE_3_ARGS
499 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
500 %1 word [A0], A1_16
501 IEM_SAVE_FLAGS A2, %3, %4
502 EPILOGUE_3_ARGS
503ENDPROC iemAImpl_ %+ %1 %+ _u16
504
505BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
506 PROLOGUE_3_ARGS
507 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
508 %1 dword [A0], A1_32
509 IEM_SAVE_FLAGS A2, %3, %4
510 EPILOGUE_3_ARGS
511ENDPROC iemAImpl_ %+ %1 %+ _u32
512
513 %ifdef RT_ARCH_AMD64
514BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
515 PROLOGUE_3_ARGS
516 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
517 %1 qword [A0], A1
518 IEM_SAVE_FLAGS A2, %3, %4
519 EPILOGUE_3_ARGS_EX 8
520ENDPROC iemAImpl_ %+ %1 %+ _u64
521 %endif ; RT_ARCH_AMD64
522
523 %if %2 != 0 ; locked versions requested?
524
525BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
526 PROLOGUE_3_ARGS
527 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
528 lock %1 byte [A0], A1_8
529 IEM_SAVE_FLAGS A2, %3, %4
530 EPILOGUE_3_ARGS
531ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
532
533BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
534 PROLOGUE_3_ARGS
535 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
536 lock %1 word [A0], A1_16
537 IEM_SAVE_FLAGS A2, %3, %4
538 EPILOGUE_3_ARGS
539ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
540
541BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
542 PROLOGUE_3_ARGS
543 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
544 lock %1 dword [A0], A1_32
545 IEM_SAVE_FLAGS A2, %3, %4
546 EPILOGUE_3_ARGS
547ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
548
549 %ifdef RT_ARCH_AMD64
550BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
551 PROLOGUE_3_ARGS
552 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
553 lock %1 qword [A0], A1
554 IEM_SAVE_FLAGS A2, %3, %4
555 EPILOGUE_3_ARGS_EX 8
556ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
557 %endif ; RT_ARCH_AMD64
558 %endif ; locked
559%endmacro
560
561; instr,lock, modified-flags, undefined flags
562IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
563IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
564IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
565IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
566IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
567IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
568IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
569IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
570IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
571
572
573;;
574; Macro for implementing a binary operator, VEX variant with separate input/output.
575;
576; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
577; where the 64-bit accesses requires hand coding.
578;
579; All the functions takes a pointer to the destination memory operand in A0,
580; the first source register operand in A1, the second source register operand
581; in A2 and a pointer to eflags in A3.
582;
583; @param 1 The instruction mnemonic.
584; @param 2 The modified flags.
585; @param 3 The undefined flags.
586;
; Three-operand VEX form: the result is computed into the scratch T0 and
; then stored through A0, leaving the source registers untouched.
587%macro IEMIMPL_VEX_BIN_OP 3
588BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
589 PROLOGUE_4_ARGS
590 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
591 %1 T0_32, A1_32, A2_32
592 mov [A0], T0_32
593 IEM_SAVE_FLAGS A3, %2, %3
594 EPILOGUE_4_ARGS
595ENDPROC iemAImpl_ %+ %1 %+ _u32
596
597 %ifdef RT_ARCH_AMD64
598BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
599 PROLOGUE_4_ARGS
600 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
601 %1 T0, A1, A2
602 mov [A0], T0
603 IEM_SAVE_FLAGS A3, %2, %3
604 EPILOGUE_4_ARGS
605ENDPROC iemAImpl_ %+ %1 %+ _u64
606 %endif ; RT_ARCH_AMD64
607%endmacro
608
609; instr, modified-flags, undefined-flags
610IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
611IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
612IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
613
614;;
615; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
616;
617; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
618; where the 64-bit accesses requires hand coding.
619;
620; All the functions takes a pointer to the destination memory operand in A0,
621; the source register operand in A1 and a pointer to eflags in A2.
622;
623; @param 1 The instruction mnemonic.
624; @param 2 The modified flags.
625; @param 3 The undefined flags.
626;
; These functions take three arguments (12 fastcall bytes), so the 3-arg
; prologue/epilogue pair is used. (Previously PROLOGUE_4_ARGS/EPILOGUE_4_ARGS
; were used here, which on 32-bit x86 performed 'ret 8' although only 4 bytes
; of stack arguments exist, corrupting the caller's stack, and read a
; nonexistent 4th argument. On AMD64 both pairs are equivalent no-ops/ret.)
627%macro IEMIMPL_VEX_BIN_OP_2 3
628BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
629 PROLOGUE_3_ARGS
630 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
631 mov T0_32, [A0]
632 %1 T0_32, A1_32
633 mov [A0], T0_32
634 IEM_SAVE_FLAGS A2, %2, %3
635 EPILOGUE_3_ARGS
636ENDPROC iemAImpl_ %+ %1 %+ _u32
637
638 %ifdef RT_ARCH_AMD64
639BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
640 PROLOGUE_3_ARGS
641 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
642 mov T0, [A0]
643 %1 T0, A1
644 mov [A0], T0
645 IEM_SAVE_FLAGS A2, %2, %3
646 EPILOGUE_3_ARGS
647ENDPROC iemAImpl_ %+ %1 %+ _u64
648 %endif ; RT_ARCH_AMD64
649%endmacro
650
651; instr, modified-flags, undefined-flags
652IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
653IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
654IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
655
656
657;;
658; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
659;
660; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
661; where the 64-bit accesses requires hand coding.
662;
663; All the functions takes a pointer to the destination memory operand in A0,
664; the first source register operand in A1 and the second source register
665; operand (shift count) in A2. EFLAGS are not touched, so no eflags pointer.
666;
667; @param 1 The instruction mnemonic.
668; @param 2 Fallback instruction if applicable.
669; @param 3 Whether to emit fallback or not.
670;
671%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
672BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
673 PROLOGUE_3_ARGS
674 %1 T0_32, A1_32, A2_32
675 mov [A0], T0_32
676 EPILOGUE_3_ARGS
677ENDPROC iemAImpl_ %+ %1 %+ _u32
678
679 %if %3
680BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
681 PROLOGUE_3_ARGS
682 %ifdef ASM_CALL64_GCC
683 mov cl, A2_8 ; shift count into cl for the non-VEX instruction
684 %2 A1_32, cl
685 mov [A0], A1_32
686 %else
 ; MSC: A0 is rcx; after the xchg, cl holds the shift count and A2 the dest ptr.
687 xchg A2, A0
688 %2 A1_32, cl
689 mov [A2], A1_32
690 %endif
691 EPILOGUE_3_ARGS
692ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
693 %endif
694
695 %ifdef RT_ARCH_AMD64
696BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
697 PROLOGUE_3_ARGS
698 %1 T0, A1, A2
699 mov [A0], T0
700 EPILOGUE_3_ARGS
701ENDPROC iemAImpl_ %+ %1 %+ _u64
702
703 %if %3
704BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
705 PROLOGUE_3_ARGS
706 %ifdef ASM_CALL64_GCC
707 mov cl, A2_8
708 %2 A1, cl
709 mov [A0], A1 ; store the full 64-bit result (was a truncating 32-bit store)
710 %else
 ; MSC: after the xchg, cl holds the shift count and A2 the dest ptr.
711 xchg A2, A0
712 %2 A1, cl
713 mov [A2], A1 ; store the full 64-bit result (was a truncating 32-bit store)
714 %endif
 ; Note: the old trailing 'mov [A0], A1' is gone — in the MSC path A0 holds
 ; the shift count after the xchg, so that store went through a bogus address.
715 EPILOGUE_3_ARGS
716ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
717 %endif
718 %endif ; RT_ARCH_AMD64
719%endmacro
720
721; instr, fallback instr, emit fallback
722IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
723IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
724IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
725IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
726IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
728
729
730;
731; RORX uses an immediate byte for the shift count, so we only do the
732; fallback implementation of that one.
733;
; In: A0 = pointer to destination, A1 = source value, A2 = rotate count.
; GCC convention: count copied into cl. MSC convention: A0 is rcx, so after
; 'xchg A2, A0' cl already holds the count and A2 the destination pointer.
734BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
735 PROLOGUE_3_ARGS
736 %ifdef ASM_CALL64_GCC
737 mov cl, A2_8
738 ror A1_32, cl
739 mov [A0], A1_32
740 %else
741 xchg A2, A0
742 ror A1_32, cl
743 mov [A2], A1_32
744 %endif
745 EPILOGUE_3_ARGS
746ENDPROC iemAImpl_rorx_u32
747
748 %ifdef RT_ARCH_AMD64
; 64-bit variant of the above; same register dance, qword-sized operands.
749BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
750 PROLOGUE_3_ARGS
751 %ifdef ASM_CALL64_GCC
752 mov cl, A2_8
753 ror A1, cl
754 mov [A0], A1
755 %else
756 xchg A2, A0
757 ror A1, cl
758 mov [A2], A1
759 %endif
760 EPILOGUE_3_ARGS
761ENDPROC iemAImpl_rorx_u64
762 %endif ; RT_ARCH_AMD64
763
763
764
765;
766; MULX
767;
; In: A0 = pointer to the high result, A1 = pointer to the low result,
; A2 = uSrc1 (must end up in xDX, MULX's implicit operand), A3 = uSrc2.
; The low half is stored first so the high half wins when both destination
; pointers alias.
768BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
769 PROLOGUE_4_ARGS
770%ifdef ASM_CALL64_GCC
771 ; A2_32 is EDX - perfect
772 mulx T0_32, T1_32, A3_32
773 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
774 mov [A0], T0_32
775%else
776 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
777 xchg A1, A2
778 mulx T0_32, T1_32, A3_32
779 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
780 mov [A0], T0_32
781%endif
782 EPILOGUE_4_ARGS
783ENDPROC iemAImpl_mulx_u32
784
785
; Plain MUL fallback for hosts without BMI2; EDX:EAX = EAX * src.
786BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
787 PROLOGUE_4_ARGS
788%ifdef ASM_CALL64_GCC
789 ; A2_32 is EDX, T0_32 is EAX
790 mov eax, A3_32
791 mul A2_32
792 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
793 mov [A0], edx
794%else
795 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
796 xchg A1, A2
797 mov eax, A3_32
798 mul A2_32
799 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
800 mov [A0], edx
801%endif
802 EPILOGUE_4_ARGS
803ENDPROC iemAImpl_mulx_u32_fallback
804
805%ifdef RT_ARCH_AMD64
806BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
807 PROLOGUE_4_ARGS
808%ifdef ASM_CALL64_GCC
809 ; A2 is RDX - perfect
810 mulx T0, T1, A3
811 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
812 mov [A0], T0
813%else
814 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
815 xchg A1, A2
816 mulx T0, T1, A3
817 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
818 mov [A0], T0
819%endif
820 EPILOGUE_4_ARGS
821ENDPROC iemAImpl_mulx_u64
822
823
; Plain MUL fallback; RDX:RAX = RAX * src.
824BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
825 PROLOGUE_4_ARGS
826%ifdef ASM_CALL64_GCC
827 ; A2 is RDX, T0 is RAX
828 mov rax, A3
829 mul A2
830 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
831 mov [A0], rdx
832%else
833 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
834 xchg A1, A2
835 mov rax, A3
836 mul A2
837 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
838 mov [A0], rdx
839%endif
840 EPILOGUE_4_ARGS
841ENDPROC iemAImpl_mulx_u64_fallback
842
843%endif
844
845
846;;
847; Macro for implementing a bit operator.
848;
849; This will generate code for the 16, 32 and 64 bit accesses with locked
850; variants, except on 32-bit system where the 64-bit accesses requires hand
851; coding.
852;
853; All the functions takes a pointer to the destination memory operand in A0,
854; the source register operand in A1 and a pointer to eflags in A2.
855;
856; @param 1 The instruction mnemonic.
857; @param 2 Non-zero if there should be a locked version.
858; @param 3 The modified flags.
859; @param 4 The undefined flags.
860;
; Same structure as IEMIMPL_BIN_OP, minus the 8-bit variant (bt/bts/btr/btc
; have no byte form). The instruction operates directly on memory.
861%macro IEMIMPL_BIT_OP 4
862BEGINCODE
863BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
864 PROLOGUE_3_ARGS
865 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
866 %1 word [A0], A1_16
867 IEM_SAVE_FLAGS A2, %3, %4
868 EPILOGUE_3_ARGS
869ENDPROC iemAImpl_ %+ %1 %+ _u16
870
871BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
872 PROLOGUE_3_ARGS
873 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
874 %1 dword [A0], A1_32
875 IEM_SAVE_FLAGS A2, %3, %4
876 EPILOGUE_3_ARGS
877ENDPROC iemAImpl_ %+ %1 %+ _u32
878
879 %ifdef RT_ARCH_AMD64
880BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
881 PROLOGUE_3_ARGS
882 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
883 %1 qword [A0], A1
884 IEM_SAVE_FLAGS A2, %3, %4
885 EPILOGUE_3_ARGS_EX 8
886ENDPROC iemAImpl_ %+ %1 %+ _u64
887 %endif ; RT_ARCH_AMD64
888
889 %if %2 != 0 ; locked versions requested?
890
891BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
892 PROLOGUE_3_ARGS
893 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
894 lock %1 word [A0], A1_16
895 IEM_SAVE_FLAGS A2, %3, %4
896 EPILOGUE_3_ARGS
897ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
898
899BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
900 PROLOGUE_3_ARGS
901 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
902 lock %1 dword [A0], A1_32
903 IEM_SAVE_FLAGS A2, %3, %4
904 EPILOGUE_3_ARGS
905ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
906
907 %ifdef RT_ARCH_AMD64
908BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
909 PROLOGUE_3_ARGS
910 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
911 lock %1 qword [A0], A1
912 IEM_SAVE_FLAGS A2, %3, %4
913 EPILOGUE_3_ARGS_EX 8
914ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
915 %endif ; RT_ARCH_AMD64
916 %endif ; locked
917%endmacro
918IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
919IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
920IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
921IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
922
923;;
924; Macro for implementing a bit search operator.
925;
926; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
927; system where the 64-bit accesses requires hand coding.
928;
929; All the functions takes a pointer to the destination memory operand in A0,
930; the source register operand in A1 and a pointer to eflags in A2.
931;
932; In the ZF case the destination register is 'undefined', however it seems that
933; both AMD and Intel just leave it as is. The undefined EFLAGS differs between
934; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
935; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
936; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
937;
938; @param 1 The instruction mnemonic.
939; @param 2 The modified flags.
940; @param 3 The undefined flags.
941; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
942;
; Three variants per width: native host behaviour, forced-Intel behaviour
; (fixed flag calc incl. parity), and forced-AMD behaviour (ZF only).
; The '.unchanged_dst' local labels are scoped per function since each
; BEGINPROC_FASTCALL emits a new non-local label.
943%macro IEMIMPL_BIT_OP2 4
944BEGINCODE
945BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
946 PROLOGUE_3_ARGS
947 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
948 %1 T0_16, A1_16
949%if %4 != 0
950 jz .unchanged_dst
951%endif
952 mov [A0], T0_16
953.unchanged_dst:
954 IEM_SAVE_FLAGS A2, %2, %3
955 EPILOGUE_3_ARGS
956ENDPROC iemAImpl_ %+ %1 %+ _u16
957
958BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
959 PROLOGUE_3_ARGS
960 %1 T1_16, A1_16
961%if %4 != 0
962 jz .unchanged_dst
963%endif
964 mov [A0], T1_16
965 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
966 EPILOGUE_3_ARGS
967.unchanged_dst:
968 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
969 EPILOGUE_3_ARGS
970ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
971
972BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
973 PROLOGUE_3_ARGS
974 %1 T0_16, A1_16
975%if %4 != 0
976 jz .unchanged_dst
977%endif
978 mov [A0], T0_16
979.unchanged_dst:
980 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
981 EPILOGUE_3_ARGS
982ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
983
984
985BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
986 PROLOGUE_3_ARGS
987 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
988 %1 T0_32, A1_32
989%if %4 != 0
990 jz .unchanged_dst
991%endif
992 mov [A0], T0_32
993.unchanged_dst:
994 IEM_SAVE_FLAGS A2, %2, %3
995 EPILOGUE_3_ARGS
996ENDPROC iemAImpl_ %+ %1 %+ _u32
997
998BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
999 PROLOGUE_3_ARGS
1000 %1 T1_32, A1_32
1001%if %4 != 0
1002 jz .unchanged_dst
1003%endif
1004 mov [A0], T1_32
1005 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1006 EPILOGUE_3_ARGS
1007.unchanged_dst:
1008 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1009 EPILOGUE_3_ARGS
1010ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1011
1012BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1013 PROLOGUE_3_ARGS
1014 %1 T0_32, A1_32
1015%if %4 != 0
1016 jz .unchanged_dst
1017%endif
1018 mov [A0], T0_32
1019.unchanged_dst:
1020 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1021 EPILOGUE_3_ARGS
1022ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1023
1024
1025 %ifdef RT_ARCH_AMD64
1026
1027BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1028 PROLOGUE_3_ARGS
1029 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1030 %1 T0, A1
1031%if %4 != 0
1032 jz .unchanged_dst
1033%endif
1034 mov [A0], T0
1035.unchanged_dst:
1036 IEM_SAVE_FLAGS A2, %2, %3
1037 EPILOGUE_3_ARGS_EX 8
1038ENDPROC iemAImpl_ %+ %1 %+ _u64
1039
; NOTE(review): unlike the _u16/_u32 intel variants, this one also does
; IEM_MAYBE_LOAD_FLAGS — confirm whether the asymmetry is intentional.
1040BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1041 PROLOGUE_3_ARGS
1042 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1043 %1 T1, A1
1044%if %4 != 0
1045 jz .unchanged_dst
1046%endif
1047 mov [A0], T1
1048 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1049 EPILOGUE_3_ARGS
1050.unchanged_dst:
1051 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1052 EPILOGUE_3_ARGS
1053ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1054
1055BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1056 PROLOGUE_3_ARGS
1057 %1 T0, A1
1058%if %4 != 0
1059 jz .unchanged_dst
1060%endif
1061 mov [A0], T0
1062.unchanged_dst:
1063 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1064 EPILOGUE_3_ARGS_EX 8
1065ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1066
1067 %endif ; RT_ARCH_AMD64
1068%endmacro
1069
1070IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1071IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1072IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1073IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1074
1075
1076;;
1077; Macro for implementing POPCNT.
1078;
1079; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1080; system where the 64-bit accesses requires hand coding.
1081;
1082; All the functions take a pointer to the destination memory operand in A0,
1083; the source register operand in A1 and a pointer to eflags in A2.
1084;
1085; ASSUMES Intel and AMD set EFLAGS the same way.
1086;
1087; ASSUMES the instruction does not support memory destination.
1088;
1089; @param 1 The instruction mnemonic.
1090; @param 2 The modified flags.
1091; @param 3 The undefined flags.
1092;
; Result is computed into the scratch T0 and then stored through A0 (the
; instruction itself only takes a register destination, per the ASSUMES).
1093%macro IEMIMPL_BIT_OP3 3
1094BEGINCODE
1095BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1096 PROLOGUE_3_ARGS
1097 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1098 %1 T0_16, A1_16
1099 mov [A0], T0_16
1100 IEM_SAVE_FLAGS A2, %2, %3
1101 EPILOGUE_3_ARGS
1102ENDPROC iemAImpl_ %+ %1 %+ _u16
1103
1104BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1105 PROLOGUE_3_ARGS
1106 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1107 %1 T0_32, A1_32
1108 mov [A0], T0_32
1109 IEM_SAVE_FLAGS A2, %2, %3
1110 EPILOGUE_3_ARGS
1111ENDPROC iemAImpl_ %+ %1 %+ _u32
1112
1113 %ifdef RT_ARCH_AMD64
1114BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1115 PROLOGUE_3_ARGS
1116 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1117 %1 T0, A1
1118 mov [A0], T0
1119 IEM_SAVE_FLAGS A2, %2, %3
1120 EPILOGUE_3_ARGS_EX 8
1121ENDPROC iemAImpl_ %+ %1 %+ _u64
1122 %endif ; RT_ARCH_AMD64
1123%endmacro
1124IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1125
1126
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
BEGINCODE
; @param 1      EFLAGS that are modified.
; @param 2      Undefined EFLAGS.
; @param 3      Function suffix.
; @param 4      EFLAGS variation: 0 for native, 1 for intel, 2 for AMD.
;               NOTE: only %4 == 1 (intel) is special-cased below; it saves
;               flags via IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF (adjusting
;               AF/ZF and deriving SF+PF from the result register), while
;               0 and 2 store the host flags as-is with IEM_SAVE_FLAGS.
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %1, %2
        imul    A1_16, word [A0]        ; A1 *= *pu16Dst
        mov     [A0], A1_16             ; store the product
 %if %4 != 1
        IEM_SAVE_FLAGS                  A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %1, %2
        imul    A1_32, dword [A0]       ; A1 *= *pu32Dst
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS                  A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64                   ; the 32-bit host version requires hand coding (not here).
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %1, %2
        imul    A1, qword [A0]          ; A1 *= *pu64Dst
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS                  A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1182
1183
1184;
1185; XCHG for memory operands. This implies locking. No flag changes.
1186;
1187; Each function takes two arguments, first the pointer to the memory,
1188; then the pointer to the register. They all return void.
1189;
1190BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A1]              ; T1 = *pu8Reg
        xchg    [A0], T1_8              ; atomic swap (xchg w/ memory is implicitly locked)
        mov     [A1], T1_8              ; *pu8Reg = old *pu8Mem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked
1198
BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A1]             ; T1 = *pu16Reg
        xchg    [A0], T1_16             ; atomic swap (xchg w/ memory is implicitly locked)
        mov     [A1], T1_16             ; *pu16Reg = old *pu16Mem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked
1206
BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A1]             ; T1 = *pu32Reg
        xchg    [A0], T1_32             ; atomic swap (xchg w/ memory is implicitly locked)
        mov     [A1], T1_32             ; *pu32Reg = old *pu32Mem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked
1214
%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A1]                ; T1 = *pu64Reg
        xchg    [A0], T1                ; atomic swap (xchg w/ memory is implicitly locked)
        mov     [A1], T1                ; *pu64Reg = old *pu64Mem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1224
1225; Unlocked variants for fDisregardLock mode.
1226
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A0]              ; T1 = old *pu8Mem
        mov     T0_8, [A1]              ; T0 = *pu8Reg
        mov     [A1], T1_8              ; *pu8Reg = old *pu8Mem
        mov     [A0], T0_8              ; *pu8Mem = old *pu8Reg (non-atomic by design)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1235
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A0]             ; T1 = old *pu16Mem
        mov     T0_16, [A1]             ; T0 = *pu16Reg
        mov     [A1], T1_16             ; *pu16Reg = old *pu16Mem
        mov     [A0], T0_16             ; *pu16Mem = old *pu16Reg (non-atomic by design)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1244
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A0]             ; T1 = old *pu32Mem
        mov     T0_32, [A1]             ; T0 = *pu32Reg
        mov     [A1], T1_32             ; *pu32Reg = old *pu32Mem
        mov     [A0], T0_32             ; *pu32Mem = old *pu32Reg (non-atomic by design)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1253
%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A0]                ; T1 = old *pu64Mem
        mov     T0, [A1]                ; T0 = *pu64Reg
        mov     [A1], T1                ; *pu64Reg = old *pu64Mem
        mov     [A0], T0                ; *pu64Mem = old *pu64Reg (non-atomic by design)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1264
1265
;
; XADD for memory operands.
;
; Each function takes three arguments, first the pointer to the
; memory/register, then the pointer to the register, and finally a pointer to
; eflags. They all return void.
;
; xadd exchanges the register operand with the old memory value while adding,
; so the register operand is loaded before and stored back after the
; instruction in every variant below.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = *pu8Reg (addend)
        xadd    [A0], T0_8              ; *pu8Mem += T0; T0 = old *pu8Mem
        mov     [A1], T0_8              ; *pu8Reg = old *pu8Mem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; T0 = *pu16Reg (addend)
        xadd    [A0], T0_16             ; *pu16Mem += T0; T0 = old *pu16Mem
        mov     [A1], T0_16             ; *pu16Reg = old *pu16Mem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; T0 = *pu32Reg (addend)
        xadd    [A0], T0_32             ; *pu32Mem += T0; T0 = old *pu32Mem
        mov     [A1], T0_32             ; *pu32Reg = old *pu32Mem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64                    ; 64-bit variant only on 64-bit hosts.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; T0 = *pu64Reg (addend)
        xadd    [A0], T0                ; *pu64Mem += T0; T0 = old *pu64Mem
        mov     [A1], T0                ; *pu64Reg = old *pu64Mem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64

; Locked variants - identical to the above except for the LOCK prefix on xadd.

BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = *pu8Reg (addend)
        lock xadd [A0], T0_8            ; atomic: *pu8Mem += T0; T0 = old *pu8Mem
        mov     [A1], T0_8
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; T0 = *pu16Reg (addend)
        lock xadd [A0], T0_16           ; atomic: *pu16Mem += T0; T0 = old *pu16Mem
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; T0 = *pu32Reg (addend)
        lock xadd [A0], T0_32           ; atomic: *pu32Mem += T0; T0 = old *pu32Mem
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; T0 = *pu64Reg (addend)
        lock xadd [A0], T0              ; atomic: *pu64Mem += T0; T0 = old *pu64Mem
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1357
1358
;
; CMPXCHG8B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxCcx,
;                                             uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; cmpxchg8b needs ebx and rbx is callee saved.

        ; MSC: rcx=pu64Dst, rdx=pu64EaxEdx, r8=pu64EbxEcx, r9=pEFlags.
        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; ecx:ebx = exchange value
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; edx:eax = compare value
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write back edx:eax (current memory value on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; cmpxchg8b needs ebx and rbx is callee saved.

        ; SysV: rdi=pu64Dst, rsi=pu64EaxEdx, rdx=pu64EbxEcx, rcx=pEFlags.
        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; ecx:ebx = exchange value
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; edx:eax = compare value
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; write back edx:eax (current memory value on mismatch)
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (1st stack arg: 16 bytes saved regs + 4 bytes return addr)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags    (2nd stack arg)

        mov     ebx, [ecx]              ; ecx:ebx = exchange value (loads before ecx is overwritten)
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = compare value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; write back edx:eax (current memory value on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments (cdecl-style fastcall cleanup)
%endif
ENDPROC iemAImpl_cmpxchg8b

BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1453
%ifdef RT_ARCH_AMD64

;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
;                                              uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; cmpxchg16b needs rbx and rbx is callee saved.

        ; MSC: rcx=pu128Dst, rdx=pu128RaxRdx, r8=pu128RbxRcx, r9=pEFlags.
        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; rcx:rbx = exchange value
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = compare value
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write back rdx:rax (current memory value on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; cmpxchg16b needs rbx and rbx is callee saved.

        ; SysV: rdi=pu128Dst, rsi=pu128RaxRdx, rdx=pu128RbxRcx, rcx=pEFlags.
        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = exchange value
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = compare value
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; write back rdx:rax (current memory value on mismatch)
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b

BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked

%endif ; RT_ARCH_AMD64
1522
1523
;
; CMPXCHG.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
;
; @param 1      Lock prefix to put in front of the cmpxchg (or empty).
; @param 2      Function name suffix (or empty), see instantiations below.
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; al = *puAl (the compare value; cmpxchg implicitly uses the accumulator)
        %1 cmpxchg [A0], A2_8           ; if al == *puXDst: *puXDst = uReg; else: al = *puXDst
        mov     [A1], al                ; write back the accumulator
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; ax = *puAx (compare value)
        %1 cmpxchg [A0], A2_16          ; if ax == *puXDst: *puXDst = uReg; else: ax = *puXDst
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; eax = *puEax (compare value)
        %1 cmpxchg [A0], A2_32          ; if eax == *puXDst: *puXDst = uReg; else: eax = *puXDst
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; rax = *puRax (compare value)
        %1 cmpxchg [A0], A2             ; if rax == *puXDst: *puXDst = uReg; else: rax = *puXDst
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = exchange value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = compare value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        jz      .cmpxchg8b_not_equal    ; NB: inverted label name; ZF=1 means the exchange succeeded.
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax              ; write back edx:eax (current memory value on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1621
;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.  (There is no source register operand for
; unary operators - the code below only uses A0 and A1.)
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; Note: 'not' changes no flags; 'neg' also modifies CF.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1710
1711
;
; BSWAP. No flag changes.
;
; Each function takes one argument, pointer to the value to bswap
; (input/output). They all return void.
;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h                          ; operand-size prefix: turns the following bswap into a
        bswap   T0_32                   ; 16-bit form, which is architecturally undefined - this
                                        ; simply replays whatever the host CPU does for it.
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1726
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T1_32, [A0]             ; fetch the dword to reverse
        bswap   T1_32                   ; reverse the byte order
        mov     [A0], T1_32             ; store it back
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1734
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T1, [A0]                ; fetch the qword to reverse
        bswap   T1                      ; reverse the byte order
        mov     [A0], T1                ; store it back
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: swap each dword half and exchange their positions.
        PROLOGUE_1_ARGS
        mov     T1, [A0]                ; T1 = low dword
        mov     T0, [A0 + 4]            ; T0 = high dword
        bswap   T1
        bswap   T0
        mov     [A0 + 4], T1            ; reversed low dword becomes the high dword
        mov     [A0], T0                ; reversed high dword becomes the low dword
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1753
1754
;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; the count must be in cl; rcx (A3) is free in a 3-arg function.
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; MSC: A0 is rcx, so swap to get the count into cl
        %1      byte [A1], cl           ; and the destination pointer into A1.
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; count -> cl (rcx is free here).
        %1      word [A0], cl
 %else
        xchg    A1, A0                  ; MSC: swap so the count lands in cl (A0 is rcx).
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; count -> cl (rcx is free here).
        %1      dword [A0], cl
 %else
        xchg    A1, A0                  ; MSC: swap so the count lands in cl (A0 is rcx).
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; count -> cl (rcx is free here).
        %1      qword [A0], cl
 %else
        xchg    A1, A0                  ; MSC: swap so the count lands in cl (A0 is rcx).
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1841
1842
;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The functions takes the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; GCC: count (A2) -> rcx/cl; eflags ptr parked in A2.
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; restore before IEM_SAVE_FLAGS uses A3.
 %else
        xchg    A0, A2                  ; MSC: A0 is rcx; swap to get count into cl,
        %1      [A2], A1_16, cl         ; destination pointer now in A2.
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; GCC: count -> cl; eflags ptr parked in A2.
        %1      [A0], A1_32, cl
        xchg    A3, A2                  ; restore before IEM_SAVE_FLAGS uses A3.
 %else
        xchg    A0, A2                  ; MSC: count -> cl, destination pointer -> A2.
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; GCC: count -> cl; eflags ptr parked in A2.
        %1      [A0], A1, cl
        xchg    A3, A2                  ; restore before IEM_SAVE_FLAGS uses A3.
 %else
        xchg    A0, A2                  ; MSC: count -> cl, destination pointer -> A2.
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1913
1914
;;
; Macro for implementing multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer. The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 so the caller can be used for div/idiv as well as
; for the mul/imul implementation.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
; @param 4      Name suffix.
; @param 5      EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]                ; al = *pu16AX (low byte)
        %1      A1_8                    ; ax = al * u8Factor (or signed variant)
        mov     [A0], ax                ; the 8-bit form produces a 16-bit result in AX.
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; return 0 (shared convention with the div workers).
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]                ; ax = *pu16AX
 %ifdef ASM_CALL64_GCC
        %1      A2_16                   ; dx:ax = ax * operand (A2 is rdx; read before written).
        mov     [A0], ax
        mov     [A1], dx                ; store the high half.
 %else
        mov     T1, A1                  ; MSC: A1 is rdx, which the instruction clobbers - save it.
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax                ; return 0.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]               ; eax = *pu32EAX
 %ifdef ASM_CALL64_GCC
        %1      A2_32                   ; edx:eax = eax * operand.
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; MSC: A1 is rdx, which the instruction clobbers - save it.
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax                ; return 0.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]               ; rax = *pu64RAX
 %ifdef ASM_CALL64_GCC
        %1      A2                      ; rdx:rax = rax * operand.
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; MSC: A1 is rdx, which the instruction clobbers - save it.
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
 %endif
        xor     eax, eax                ; return 0.
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0, _amd,   2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd,   2
2032
2033
BEGINCODE
;;
; Worker function for negating a 32-bit number in T1:T0
; (i.e. computes T1:T0 = 0 - T1:T0 using a stack bounce buffer).
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0                       ; set up a 0:0 bounce buffer on the stack.
        push    0
        xchg    T0_32, [xSP]            ; swap: T1:T0 = 0, stack = original value.
        xchg    T1_32, [xSP + xCB]
        sub     T0_32, [xSP]            ; 64-bit subtraction: T1:T0 = 0 - original,
        sbb     T1_32, [xSP + xCB]      ; propagating the borrow into the high half.
        add     xSP, xCB*2              ; drop the bounce buffer.
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2048
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating a 64-bit number in T1:T0
; (i.e. computes T1:T0 = 0 - T1:T0 using a stack bounce buffer).
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0                       ; set up a 0:0 bounce buffer on the stack.
        push    0
        xchg    T0, [xSP]               ; swap: T1:T0 = 0, stack = original value.
        xchg    T1, [xSP + xCB]
        sub     T0, [xSP]               ; 128-bit subtraction: T1:T0 = 0 - original,
        sbb     T1, [xSP + xCB]         ; propagating the borrow into the high half.
        add     xSP, xCB*2              ; drop the bounce buffer.
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2064
2065
2066;;
2067; Macro for implementing a division operations.
2068;
2069; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2070; 32-bit system where the 64-bit accesses requires hand coding.
2071;
2072; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2073; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2074; pointer to eflags in A3.
2075;
2076; The functions all return 0 on success and -1 if a divide error should be
2077; raised by the caller.
2078;
2079; @param 1 The instruction mnemonic.
2080; @param 2 The modified flags.
2081; @param 3 The undefined flags.
2082; @param 4 1 if signed, 0 if unsigned.
2083; @param 5 Function suffix.
2084; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2085; 2 for AMD (set AF, clear PF, ZF and SF).
2086;
2087; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2088;
2089%macro IEMIMPL_DIV_OP 6
2090BEGINCODE
2091BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2092 PROLOGUE_3_ARGS
2093
2094 ; div by chainsaw check.
2095 test A1_8, A1_8
2096 jz .div_zero
2097
2098 ; Overflow check - unsigned division is simple to verify, haven't
2099 ; found a simple way to check signed division yet unfortunately.
2100 %if %4 == 0
2101 cmp [A0 + 1], A1_8
2102 jae .div_overflow
2103 %else
2104 mov T0_16, [A0] ; T0 = dividend
2105 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2106 test A1_8, A1_8
2107 js .divisor_negative
2108 test T0_16, T0_16
2109 jns .both_positive
2110 neg T0_16
2111.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2112 push T0 ; Start off like unsigned below.
2113 shr T0_16, 7
2114 cmp T0_8, A1_8
2115 pop T0
2116 jb .div_no_overflow
2117 ja .div_overflow
2118 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2119 cmp T0_8, A1_8
2120 jae .div_overflow
2121 jmp .div_no_overflow
2122
2123.divisor_negative:
2124 neg A1_8
2125 test T0_16, T0_16
2126 jns .one_of_each
2127 neg T0_16
2128.both_positive: ; Same as unsigned shifted by sign indicator bit.
2129 shr T0_16, 7
2130 cmp T0_8, A1_8
2131 jae .div_overflow
2132.div_no_overflow:
2133 mov A1, T1 ; restore divisor
2134 %endif
2135
2136 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2137 mov ax, [A0]
2138 %1 A1_8
2139 mov [A0], ax
2140 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2141 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2142 %else
2143 IEM_SAVE_FLAGS A2, %2, %3
2144 %endif
2145 xor eax, eax
2146
2147.return:
2148 EPILOGUE_3_ARGS
2149
2150.div_zero:
2151.div_overflow:
2152 mov eax, -1
2153 jmp .return
2154ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2155
2156BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2157 PROLOGUE_4_ARGS
2158
2159 ; div by chainsaw check.
2160 test A2_16, A2_16
2161 jz .div_zero
2162
2163 ; Overflow check - unsigned division is simple to verify, haven't
2164 ; found a simple way to check signed division yet unfortunately.
2165 %if %4 == 0
2166 cmp [A1], A2_16
2167 jae .div_overflow
2168 %else
2169 mov T0_16, [A1]
2170 shl T0_32, 16
2171 mov T0_16, [A0] ; T0 = dividend
2172 mov T1, A2 ; T1 = divisor
2173 test T1_16, T1_16
2174 js .divisor_negative
2175 test T0_32, T0_32
2176 jns .both_positive
2177 neg T0_32
2178.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2179 push T0 ; Start off like unsigned below.
2180 shr T0_32, 15
2181 cmp T0_16, T1_16
2182 pop T0
2183 jb .div_no_overflow
2184 ja .div_overflow
2185 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2186 cmp T0_16, T1_16
2187 jae .div_overflow
2188 jmp .div_no_overflow
2189
2190.divisor_negative:
2191 neg T1_16
2192 test T0_32, T0_32
2193 jns .one_of_each
2194 neg T0_32
2195.both_positive: ; Same as unsigned shifted by sign indicator bit.
2196 shr T0_32, 15
2197 cmp T0_16, T1_16
2198 jae .div_overflow
2199.div_no_overflow:
2200 %endif
2201
2202 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2203 %ifdef ASM_CALL64_GCC
2204 mov T1, A2
2205 mov ax, [A0]
2206 mov dx, [A1]
2207 %1 T1_16
2208 mov [A0], ax
2209 mov [A1], dx
2210 %else
2211 mov T1, A1
2212 mov ax, [A0]
2213 mov dx, [T1]
2214 %1 A2_16
2215 mov [A0], ax
2216 mov [T1], dx
2217 %endif
2218 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2219 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2220 %else
2221 IEM_SAVE_FLAGS A3, %2, %3
2222 %endif
2223 xor eax, eax
2224
2225.return:
2226 EPILOGUE_4_ARGS
2227
2228.div_zero:
2229.div_overflow:
2230 mov eax, -1
2231 jmp .return
2232ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2233
2234BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2235 PROLOGUE_4_ARGS
2236
2237 ; div by chainsaw check.
2238 test A2_32, A2_32
2239 jz .div_zero
2240
2241 ; Overflow check - unsigned division is simple to verify, haven't
2242 ; found a simple way to check signed division yet unfortunately.
2243 %if %4 == 0
2244 cmp [A1], A2_32
2245 jae .div_overflow
2246 %else
2247 push A2 ; save A2 so we modify it (we out of regs on x86).
2248 mov T0_32, [A0] ; T0 = dividend low
2249 mov T1_32, [A1] ; T1 = dividend high
2250 test A2_32, A2_32
2251 js .divisor_negative
2252 test T1_32, T1_32
2253 jns .both_positive
2254 call NAME(iemAImpl_negate_T0_T1_u32)
2255.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2256 push T0 ; Start off like unsigned below.
2257 shl T1_32, 1
2258 shr T0_32, 31
2259 or T1_32, T0_32
2260 cmp T1_32, A2_32
2261 pop T0
2262 jb .div_no_overflow
2263 ja .div_overflow
2264 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2265 cmp T0_32, A2_32
2266 jae .div_overflow
2267 jmp .div_no_overflow
2268
2269.divisor_negative:
2270 neg A2_32
2271 test T1_32, T1_32
2272 jns .one_of_each
2273 call NAME(iemAImpl_negate_T0_T1_u32)
2274.both_positive: ; Same as unsigned shifted by sign indicator bit.
2275 shl T1_32, 1
2276 shr T0_32, 31
2277 or T1_32, T0_32
2278 cmp T1_32, A2_32
2279 jae .div_overflow
2280.div_no_overflow:
2281 pop A2
2282 %endif
2283
2284 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2285 mov eax, [A0]
2286 %ifdef ASM_CALL64_GCC
2287 mov T1, A2
2288 mov eax, [A0]
2289 mov edx, [A1]
2290 %1 T1_32
2291 mov [A0], eax
2292 mov [A1], edx
2293 %else
2294 mov T1, A1
2295 mov eax, [A0]
2296 mov edx, [T1]
2297 %1 A2_32
2298 mov [A0], eax
2299 mov [T1], edx
2300 %endif
2301 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2302 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2303 %else
2304 IEM_SAVE_FLAGS A3, %2, %3
2305 %endif
2306 xor eax, eax
2307
2308.return:
2309 EPILOGUE_4_ARGS
2310
2311.div_overflow:
2312 %if %4 != 0
2313 pop A2
2314 %endif
2315.div_zero:
2316 mov eax, -1
2317 jmp .return
2318ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2319
2320 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2321BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2322 PROLOGUE_4_ARGS
2323
2324 test A2, A2
2325 jz .div_zero
2326 %if %4 == 0
2327 cmp [A1], A2
2328 jae .div_overflow
2329 %else
2330 push A2 ; save A2 so we modify it (we out of regs on x86).
2331 mov T0, [A0] ; T0 = dividend low
2332 mov T1, [A1] ; T1 = dividend high
2333 test A2, A2
2334 js .divisor_negative
2335 test T1, T1
2336 jns .both_positive
2337 call NAME(iemAImpl_negate_T0_T1_u64)
2338.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2339 push T0 ; Start off like unsigned below.
2340 shl T1, 1
2341 shr T0, 63
2342 or T1, T0
2343 cmp T1, A2
2344 pop T0
2345 jb .div_no_overflow
2346 ja .div_overflow
2347 mov T1, 0x7fffffffffffffff
2348 and T0, T1 ; Special case for covering (divisor - 1).
2349 cmp T0, A2
2350 jae .div_overflow
2351 jmp .div_no_overflow
2352
2353.divisor_negative:
2354 neg A2
2355 test T1, T1
2356 jns .one_of_each
2357 call NAME(iemAImpl_negate_T0_T1_u64)
2358.both_positive: ; Same as unsigned shifted by sign indicator bit.
2359 shl T1, 1
2360 shr T0, 63
2361 or T1, T0
2362 cmp T1, A2
2363 jae .div_overflow
2364.div_no_overflow:
2365 pop A2
2366 %endif
2367
2368 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2369 mov rax, [A0]
2370 %ifdef ASM_CALL64_GCC
2371 mov T1, A2
2372 mov rax, [A0]
2373 mov rdx, [A1]
2374 %1 T1
2375 mov [A0], rax
2376 mov [A1], rdx
2377 %else
2378 mov T1, A1
2379 mov rax, [A0]
2380 mov rdx, [T1]
2381 %1 A2
2382 mov [A0], rax
2383 mov [T1], rdx
2384 %endif
2385 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2386 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2387 %else
2388 IEM_SAVE_FLAGS A3, %2, %3
2389 %endif
2390 xor eax, eax
2391
2392.return:
2393 EPILOGUE_4_ARGS_EX 12
2394
2395.div_overflow:
2396 %if %4 != 0
2397 pop A2
2398 %endif
2399.div_zero:
2400 mov eax, -1
2401 jmp .return
2402ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2403 %endif ; !RT_ARCH_AMD64
2404
2405%endmacro
2406
; Instantiate the div/idiv workers.
; Arguments: 1=instruction, 2+3=EFLAGS masks handed to IEM_MAYBE_LOAD_FLAGS / IEM_SAVE_FLAGS,
;            4=non-zero for signed division (enables the idiv overflow pre-checks),
;            5=function name suffix, 6=EFLAGS variant (0=generic, 1=intel, 2=amd).
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2413
2414
2415;;
2416; Macro for implementing memory fence operation.
2417;
2418; No return value, no operands or anything.
2419;
2420; @param 1 The instruction.
2421;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; emit the fence instruction itself (lfence/sfence/mfence).
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

; One worker per fence instruction.
IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2433
2434;;
2435; Alternative for non-SSE2 host.
2436;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        ; Fence substitute for hosts without SSE2: XCHG with a memory operand
        ; is implicitly locked, which serializes memory accesses.
        push    xAX
        xchg    xAX, [xSP]              ; the implicitly locked access is the fence; xAX value is unchanged by the swap-back.
        add     xSP, xCB                ; drop the temporary stack slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
2443
2444
2445;;
2446; Initialize the FPU for the actual instruction being emulated, this means
2447; loading parts of the guest's control word and status word.
2448;
2449; @uses 24 bytes of stack. T0, T1
2450; @param 1 Expression giving the address of the FXSTATE of the guest.
2451;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; capture the current (freshly fninit'ed) environment as a template.

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; take only the guest's mask/precision/rounding bits.
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK      ; guest condition code bits (C0..C3)...
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK    ; ...merged with the actual TOP so the stack layout stays valid.
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; activate the merged environment for the emulated instruction.
%endmacro
2470
2471
2472;;
2473; Initialize the FPU for the actual instruction being emulated, this means
2474; loading parts of the guest's control word, status word, and update the
2475; tag word for the top register if it's empty.
2476;
2477; ASSUMES actual TOP=7
2478;
2479; @uses 24 bytes of stack. T0, T1
2480; @param 1 Expression giving the address of the FXSTATE of the guest.
2481;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; capture the current (freshly fninit'ed) environment as a template.

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; take only the guest's mask/precision/rounding bits.
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK   ; guest condition code bits (C0..C3)...
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; ...merged with the actual TOP so the stack layout stays valid.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT ; T1 = the guest's TOP register number.
        and     T1_32, X86_FSW_TOP_SMASK
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3 (empty); see the ASSUMES in the header.
%%st0_not_empty:

        fldenv  [xSP]                   ; activate the merged environment for the emulated instruction.
%endmacro
2509
2510
2511;;
2512; Need to move this as well somewhere better?
2513;
; Result package for FPU operations returning a value: 80-bit result + FSW.
struc IEMFPURESULT
    .r80Result  resw 5                  ; the 80-bit (10 byte) floating point result.
    .FSW        resw 1                  ; the output FPU status word.
endstruc
2518
2519
2520;;
2521; Need to move this as well somewhere better?
2522;
; Result package for FPU operations producing two values (e.g. fxtract-style).
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; the first 80-bit floating point result.
    .FSW        resw 1                  ; the output FPU status word.
    .r80Result2 resw 5                  ; the second 80-bit floating point result.
endstruc
2528
2529
2530;
2531;---------------------- 16-bit signed integer operations ----------------------
2532;
2533
2534
2535;;
; Converts a 16-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer to convert.
2541;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fild    word [A2]               ; load the 16-bit signed integer into st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2558
2559
2560;;
2561; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2562;
2563; @param A0 FPU context (fxsave).
2564; @param A1 Where to return the output FSW.
2565; @param A2 Where to store the 16-bit signed integer value.
2566; @param A3 Pointer to the 80-bit value.
2567;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; load the input before the guest FCW is made active.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (incl. rounding mode) and FSW condition codes.
        fistp   word [A2]               ; convert and store st0 as a 16-bit signed integer.

        fnstsw  word [A1]               ; return the resulting status word to the caller.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2583
2584
2585;;
2586; Store a 80-bit floating point value (register) as a 16-bit signed integer
2587; (memory) with truncation.
2588;
2589; @param A0 FPU context (fxsave).
2590; @param A1 Where to return the output FSW.
2591; @param A2 Where to store the 16-bit signed integer value.
2592; @param A3 Pointer to the 80-bit value.
2593;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; load the input before the guest FCW is made active.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fisttp  word [A2]               ; store with truncation (ignores the FCW rounding mode).

        fnstsw  word [A1]               ; return the resulting status word to the caller.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2609
2610
2611;;
2612; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2613;
2614; @param 1 The instruction
2615;
2616; @param A0 FPU context (fxsave).
2617; @param A1 Pointer to a IEMFPURESULT for the output.
2618; @param A2 Pointer to the 80-bit value.
2619; @param A3 Pointer to the 16-bit value.
2620;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      word [A3]               ; st0 <op>= 16-bit integer memory operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

; One worker per r80-by-i16 arithmetic instruction.
IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2647
2648
2649;;
2650; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2651; only returning FSW.
2652;
2653; @param 1 The instruction
2654;
2655; @param A0 FPU context (fxsave).
2656; @param A1 Where to store the output FSW.
2657; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
2659;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      word [A3]               ; compare st0 against the 16-bit integer memory operand.

        fnstsw  word [A1]               ; only the status word is returned; no value result.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2679
2680
2681
2682;
2683;---------------------- 32-bit signed integer operations ----------------------
2684;
2685
2686
2687;;
; Converts a 32-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer to convert.
2693;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fild    dword [A2]              ; load the 32-bit signed integer into st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2710
2711
2712;;
2713; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2714;
2715; @param A0 FPU context (fxsave).
2716; @param A1 Where to return the output FSW.
2717; @param A2 Where to store the 32-bit signed integer value.
2718; @param A3 Pointer to the 80-bit value.
2719;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; load the input before the guest FCW is made active.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (incl. rounding mode) and FSW condition codes.
        fistp   dword [A2]              ; convert and store st0 as a 32-bit signed integer.

        fnstsw  word [A1]               ; return the resulting status word to the caller.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2735
2736
2737;;
2738; Store a 80-bit floating point value (register) as a 32-bit signed integer
2739; (memory) with truncation.
2740;
2741; @param A0 FPU context (fxsave).
2742; @param A1 Where to return the output FSW.
2743; @param A2 Where to store the 32-bit signed integer value.
2744; @param A3 Pointer to the 80-bit value.
2745;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; load the input before the guest FCW is made active.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fisttp  dword [A2]              ; store with truncation (ignores the FCW rounding mode).

        fnstsw  word [A1]               ; return the resulting status word to the caller.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2761
2762
2763;;
2764; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2765;
2766; @param 1 The instruction
2767;
2768; @param A0 FPU context (fxsave).
2769; @param A1 Pointer to a IEMFPURESULT for the output.
2770; @param A2 Pointer to the 80-bit value.
2771; @param A3 Pointer to the 32-bit value.
2772;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      dword [A3]              ; st0 <op>= 32-bit integer memory operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

; One worker per r80-by-i32 arithmetic instruction.
IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2799
2800
2801;;
2802; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2803; only returning FSW.
2804;
2805; @param 1 The instruction
2806;
2807; @param A0 FPU context (fxsave).
2808; @param A1 Where to store the output FSW.
2809; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
2811;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      dword [A3]              ; compare st0 against the 32-bit integer memory operand.

        fnstsw  word [A1]               ; only the status word is returned; no value result.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2831
2832
2833
2834;
2835;---------------------- 64-bit signed integer operations ----------------------
2836;
2837
2838
2839;;
; Converts a 64-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer to convert.
2845;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fild    qword [A2]              ; load the 64-bit signed integer into st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2862
2863
2864;;
2865; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2866;
2867; @param A0 FPU context (fxsave).
2868; @param A1 Where to return the output FSW.
2869; @param A2 Where to store the 64-bit signed integer value.
2870; @param A3 Pointer to the 80-bit value.
2871;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; load the input before the guest FCW is made active.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (incl. rounding mode) and FSW condition codes.
        fistp   qword [A2]              ; convert and store st0 as a 64-bit signed integer.

        fnstsw  word [A1]               ; return the resulting status word to the caller.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2887
2888
2889;;
2890; Store a 80-bit floating point value (register) as a 64-bit signed integer
2891; (memory) with truncation.
2892;
2893; @param A0 FPU context (fxsave).
2894; @param A1 Where to return the output FSW.
2895; @param A2 Where to store the 64-bit signed integer value.
2896; @param A3 Pointer to the 80-bit value.
2897;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; load the input before the guest FCW is made active.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fisttp  qword [A2]              ; store with truncation (ignores the FCW rounding mode).

        fnstsw  word [A1]               ; return the resulting status word to the caller.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2913
2914
2915
2916;
2917;---------------------- 32-bit floating point operations ----------------------
2918;
2919
2920;;
2921; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2922;
2923; @param A0 FPU context (fxsave).
2924; @param A1 Pointer to a IEMFPURESULT for the output.
2925; @param A2 Pointer to the 32-bit floating point value to convert.
2926;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fld     dword [A2]              ; convert the 32-bit float to 80-bit in st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2943
2944
2945;;
2946; Store a 80-bit floating point value (register) as a 32-bit one (memory).
2947;
2948; @param A0 FPU context (fxsave).
2949; @param A1 Where to return the output FSW.
2950; @param A2 Where to store the 32-bit value.
2951; @param A3 Pointer to the 80-bit value.
2952;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; load the input before the guest FCW is made active.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (incl. rounding/precision) and FSW condition codes.
        fst     dword [A2]              ; store st0 as a 32-bit float.

        fnstsw  word [A1]               ; return the resulting status word to the caller.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2968
2969
2970;;
2971; FPU instruction working on one 80-bit and one 32-bit floating point value.
2972;
2973; @param 1 The instruction
2974;
2975; @param A0 FPU context (fxsave).
2976; @param A1 Pointer to a IEMFPURESULT for the output.
2977; @param A2 Pointer to the 80-bit value.
2978; @param A3 Pointer to the 32-bit value.
2979;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      dword [A3]              ; st0 <op>= 32-bit float memory operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

; One worker per r80-by-r32 arithmetic instruction.
IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3006
3007
3008;;
3009; FPU instruction working on one 80-bit and one 32-bit floating point value,
3010; only returning FSW.
3011;
3012; @param 1 The instruction
3013;
3014; @param A0 FPU context (fxsave).
3015; @param A1 Where to store the output FSW.
3016; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
3018;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      dword [A3]              ; compare st0 against the 32-bit float memory operand.

        fnstsw  word [A1]               ; only the status word is returned; no value result.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3038
3039
3040
3041;
3042;---------------------- 64-bit floating point operations ----------------------
3043;
3044
3045;;
3046; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3047;
3048; @param A0 FPU context (fxsave).
3049; @param A1 Pointer to a IEMFPURESULT for the output.
3050; @param A2 Pointer to the 64-bit floating point value to convert.
3051;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fld     qword [A2]              ; convert the 64-bit float to 80-bit in st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3068
3069
3070;;
3071; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3072;
3073; @param A0 FPU context (fxsave).
3074; @param A1 Where to return the output FSW.
3075; @param A2 Where to store the 64-bit value.
3076; @param A3 Pointer to the 80-bit value.
3077;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; load the input before the guest FCW is made active.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (incl. rounding/precision) and FSW condition codes.
        fst     qword [A2]              ; store st0 as a 64-bit float.

        fnstsw  word [A1]               ; return the resulting status word to the caller.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3093
3094
3095;;
3096; FPU instruction working on one 80-bit and one 64-bit floating point value.
3097;
3098; @param 1 The instruction
3099;
3100; @param A0 FPU context (fxsave).
3101; @param A1 Pointer to a IEMFPURESULT for the output.
3102; @param A2 Pointer to the 80-bit value.
3103; @param A3 Pointer to the 64-bit value.
3104;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      qword [A3]              ; st0 <op>= 64-bit float memory operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

; One worker per r80-by-r64 arithmetic instruction.
IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3131
3132;;
3133; FPU instruction working on one 80-bit and one 64-bit floating point value,
3134; only returning FSW.
3135;
3136; @param 1 The instruction
3137;
3138; @param A0 FPU context (fxsave).
3139; @param A1 Where to store the output FSW.
3140; @param A2 Pointer to the 80-bit value.
3141; @param A3 Pointer to the 64-bit value.
3142;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      qword [A3]              ; compare st0 against the 64-bit float memory operand.

        fnstsw  word [A1]               ; only the status word is returned; no value result.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3162
3163
3164
3165;
3166;---------------------- 80-bit floating point operations ----------------------
3167;
3168
3169;;
3170; Loads a 80-bit floating point register value from memory.
3171;
3172; @param A0 FPU context (fxsave).
3173; @param A1 Pointer to a IEMFPURESULT for the output.
3174; @param A2 Pointer to the 80-bit floating point value to load.
3175;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fld     tword [A2]              ; load the 80-bit value into st0 (no conversion).

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3192
3193
3194;;
3195; Store a 80-bit floating point register to memory
3196;
3197; @param A0 FPU context (fxsave).
3198; @param A1 Where to return the output FSW.
3199; @param A2 Where to store the 80-bit value.
3200; @param A3 Pointer to the 80-bit register value.
3201;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; load the input before the guest FCW is made active.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fstp    tword [A2]              ; store the full 80-bit value (no conversion).

        fnstsw  word [A1]               ; return the resulting status word to the caller.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3217
3218
3219;;
3220; Loads an 80-bit floating point register value in BCD format from memory.
3221;
3222; @param A0 FPU context (fxsave).
3223; @param A1 Pointer to a IEMFPURESULT for the output.
3224; @param A2 Pointer to the 80-bit BCD value to load.
3225;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fbld    tword [A2]              ; load the 80-bit packed BCD value into st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3242
3243
3244;;
3245; Store a 80-bit floating point register to memory as BCD
3246;
3247; @param A0 FPU context (fxsave).
3248; @param A1 Where to return the output FSW.
3249; @param A2 Where to store the 80-bit BCD value.
3250; @param A3 Pointer to the 80-bit register value.
3251;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; load the input before the guest FCW is made active.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        fbstp   tword [A2]              ; convert and store st0 as 80-bit packed BCD.

        fnstsw  word [A1]               ; return the resulting status word to the caller.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3267
3268
3269;;
3270; FPU instruction working on two 80-bit floating point values.
3271;
3272; @param 1 The instruction
3273;
3274; @param A0 FPU context (fxsave).
3275; @param A1 Pointer to a IEMFPURESULT for the output.
3276; @param A2 Pointer to the first 80-bit value (ST0)
3277; @param A3 Pointer to the second 80-bit value (STn).
3278;
; %1 = instruction, %2 = operand list to emit after it ({} for implicit-operand forms).
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; st1 = second operand (STn).
        fld     tword [A2]              ; st0 = first operand (ST0); loaded last so it ends up on top.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      %2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

; Explicit st0/st1 operands for the arithmetic forms, empty list for the implicit ones.
IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3309
3310
3311;;
3312; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3313; storing the result in ST1 and popping the stack.
3314;
3315; @param 1 The instruction
3316;
3317; @param A0 FPU context (fxsave).
3318; @param A1 Pointer to a IEMFPURESULT for the output.
3319; @param A2 Pointer to the first 80-bit value (ST1).
3320; @param A3 Pointer to the second 80-bit value (ST0).
3321;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; st1 = first operand (ST1).
        fld     tword [A3]              ; st0 = second operand (ST0).
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1                              ; stores into st1 and pops, leaving the result in st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3346
3347
3348;;
3349; FPU instruction working on two 80-bit floating point values, only
3350; returning FSW.
3351;
3352; @param 1 The instruction
3353;
3354; @param A0 FPU context (fxsave).
3355; @param A1 Pointer to a uint16_t for the resulting FSW.
3356; @param A2 Pointer to the first 80-bit value.
3357; @param A3 Pointer to the second 80-bit value.
3358;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; st1 = second operand.
        fld     tword [A2]              ; st0 = first operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      st0, st1                ; compare, result goes into the FSW condition codes.

        fnstsw  word [A1]               ; only the status word is returned; no value result.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3380
3381
3382;;
3383; FPU instruction working on two 80-bit floating point values,
3384; returning FSW and EFLAGS (eax).
3385;
3386; @param 1 The instruction
3387;
3388; @returns EFLAGS in EAX.
3389; @param A0 FPU context (fxsave).
3390; @param A1 Pointer to a uint16_t for the resulting FSW.
3391; @param A2 Pointer to the first 80-bit value.
3392; @param A3 Pointer to the second 80-bit value.
3393;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; st1 = second operand.
        fld     tword [A2]              ; st0 = first operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1      st1                     ; fcomi/fucomi compare st0 with st1 and set EFLAGS.

        fnstsw  word [A1]               ; return the resulting status word...
        pushf                           ; ...and the EFLAGS produced by the comparison...
        pop     xAX                     ; ...in xAX, per the macro header contract.

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3417
3418
3419;;
3420; FPU instruction working on one 80-bit floating point value.
3421;
3422; @param 1 The instruction
3423;
3424; @param A0 FPU context (fxsave).
3425; @param A1 Pointer to a IEMFPURESULT for the output.
3426; @param A2 Pointer to the 80-bit value.
3427;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the fnstenv/fldenv done by the macro below.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; st0 = the operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and FSW condition codes.
        %1                              ; unary operation on st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean FPU state behind.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

; One worker per unary r80 instruction.
IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3455
3456
;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW.
;
; @param 1 The instruction
; @param 2 Non-zero to also restore FTW.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch area used by the FXSTATE loading helper

        fninit                          ; start from a clean FPU state (empty register stack)
        fld tword [A2]                  ; st0 = the input value
%if %2 != 0
        ; fxam inspects the tag of st0, so the guest FTW must be restored too.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1                              ; the instruction, examining st0

        fnstsw word [A1]                ; return the FSW with the resulting condition codes

        fninit                          ; leave the FPU clean for the host
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3492
3493
3494
;;
; FPU instruction loading a 80-bit floating point constant.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub xSP, 20h                    ; scratch area used by the FXSTATE loading helper

        fninit                          ; start from a clean FPU state (empty register stack)
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW from the FXSTATE
        %1                              ; pushes the constant onto the stack (st0)

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions so the store below cannot fault
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3529
3530
;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULTTWO for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch area used by the FXSTATE loading helper

        fninit                          ; start from a clean FPU state (empty register stack)
        fld tword [A2]                  ; st0 = the input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW from the FXSTATE
        %1                              ; pushes a second result, so st0/st1 hold result2/result1

        fnstsw word [A1 + IEMFPURESULTTWO.FSW]
        fnclex                          ; clear exceptions so the stores below cannot fault
        fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit                          ; leave the FPU clean for the host
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3565
3566
3567
3568
;---------------------- SSE and MMX Operations ----------------------

;; @todo what do we need to do for MMX?
; Currently empty placeholders bracketing every MMX worker body, kept so any
; required host state save/restore can be added in one place later.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
; Empty placeholder pair bracketing the SSE worker bodies.
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
; Empty placeholder pair bracketing the AVX worker bodies.
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3588
3589
;;
; Media instruction working on two full sized registers.
;
; @param 1 The instruction
; @param 2 Whether there is an MMX variant (1) or not (0).
;
; @param A0 FPU context (fxsave).                       (currently unused by the body)
; @param A1 Pointer to the first media register size operand (input/output).
; @param A2 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq mm0, [A1]                  ; mm0 = first operand (also the destination)
        movq mm1, [A2]                  ; mm1 = second operand
        %1 mm0, mm1
        movq [A1], mm0                  ; write the result back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm0, [A1]               ; unaligned loads: operands live in guest state
        movdqu xmm1, [A2]
        %1 xmm0, xmm1
        movdqu [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb, 1
IEMIMPL_MEDIA_F2 pand, 1
IEMIMPL_MEDIA_F2 pandn, 1
IEMIMPL_MEDIA_F2 por, 1
IEMIMPL_MEDIA_F2 pxor, 1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb, 1
IEMIMPL_MEDIA_F2 paddw, 1
IEMIMPL_MEDIA_F2 paddd, 1
IEMIMPL_MEDIA_F2 paddq, 1
IEMIMPL_MEDIA_F2 paddsb, 1
IEMIMPL_MEDIA_F2 paddsw, 1
IEMIMPL_MEDIA_F2 paddusb, 1
IEMIMPL_MEDIA_F2 paddusw, 1
IEMIMPL_MEDIA_F2 psubb, 1
IEMIMPL_MEDIA_F2 psubw, 1
IEMIMPL_MEDIA_F2 psubd, 1
IEMIMPL_MEDIA_F2 psubq, 1
IEMIMPL_MEDIA_F2 psubsb, 1
IEMIMPL_MEDIA_F2 psubsw, 1
IEMIMPL_MEDIA_F2 psubusb, 1
IEMIMPL_MEDIA_F2 psubusw, 1
IEMIMPL_MEDIA_F2 pmullw, 1
IEMIMPL_MEDIA_F2 pmulld, 0
IEMIMPL_MEDIA_F2 pmulhw, 1
IEMIMPL_MEDIA_F2 pmaddwd, 1
IEMIMPL_MEDIA_F2 pminub, 1
IEMIMPL_MEDIA_F2 pminuw, 0
IEMIMPL_MEDIA_F2 pminud, 0
IEMIMPL_MEDIA_F2 pminsb, 0
IEMIMPL_MEDIA_F2 pminsw, 1
IEMIMPL_MEDIA_F2 pminsd, 0
IEMIMPL_MEDIA_F2 pmaxub, 1
IEMIMPL_MEDIA_F2 pmaxuw, 0
IEMIMPL_MEDIA_F2 pmaxud, 0
IEMIMPL_MEDIA_F2 pmaxsb, 0
IEMIMPL_MEDIA_F2 pmaxsw, 1
IEMIMPL_MEDIA_F2 pmaxsd, 0
IEMIMPL_MEDIA_F2 pabsb, 1
IEMIMPL_MEDIA_F2 pabsw, 1
IEMIMPL_MEDIA_F2 pabsd, 1
IEMIMPL_MEDIA_F2 psignb, 1
IEMIMPL_MEDIA_F2 psignw, 1
IEMIMPL_MEDIA_F2 psignd, 1
IEMIMPL_MEDIA_F2 phaddw, 1
IEMIMPL_MEDIA_F2 phaddd, 1
IEMIMPL_MEDIA_F2 phsubw, 1
IEMIMPL_MEDIA_F2 phsubd, 1
IEMIMPL_MEDIA_F2 phaddsw, 1
IEMIMPL_MEDIA_F2 phsubsw, 1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw, 1
IEMIMPL_MEDIA_F2 pmuludq, 1
3690
3691
;;
; Media instruction working on two full sized registers, but no FXSAVE state argument.
;
; @param 1 The instruction
; @param 2 Whether there is an MMX variant (1) or not (0).
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq mm0, [A0]                  ; mm0 = first operand (also the destination)
        movq mm1, [A1]                  ; mm1 = second operand
        %1 mm0, mm1
        movq [A0], mm0                  ; write the result back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm0, [A0]               ; unaligned loads: operands live in guest state
        movdqu xmm1, [A1]
        %1 xmm0, xmm1
        movdqu [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3763
;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 Pointer to the first full sized media register operand (input/output).
; @param A1 Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq mm0, [A0]                  ; mm0 = destination (only its low half is consumed)
        movq mm1, [A1]                  ; mm1 = source
        %1 mm0, mm1
        movq [A0], mm0                  ; write the interleaved result back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm0, [A0]               ; unaligned loads: operands live in guest state
        movdqu xmm1, [A1]
        %1 xmm0, xmm1
        movdqu [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3807
3808
;;
; Media instruction working on two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination register (full sized, output only).
; @param A1 Pointer to the first full sized media source register operand, where we
;           will only use the lower half as input - but we'll be loading it in full.
; @param A2 Pointer to the second full sized media source register operand, where we
;           will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source
        vmovdqu xmm1, [A2]              ; second source
        %1 xmm0, xmm0, xmm1             ; three-operand AVX form
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1 ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3853
3854
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 Pointer to the first full sized media register operand (input/output).
; @param A1 Pointer to the second full sized media register operand, where we
;           will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2               ; the high-half variants need the exact same wrapper code.
%endmacro

; Use the F1H1 wrapper for the high-half instructions (previously these invoked
; IEMIMPL_MEDIA_F1L1 directly, leaving the F1H1 macro unused; expansion is identical).
IEMIMPL_MEDIA_F1H1 punpckhbw, 1
IEMIMPL_MEDIA_F1H1 punpckhwd, 1
IEMIMPL_MEDIA_F1H1 punpckhdq, 1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3873
3874
;;
; Media instruction working on two half sized input registers (high half) and a full sized
; destination register (vpunpckh*).
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination register (full sized, output only).
; @param A1 Pointer to the first full sized media source register operand, where we
;           will only use the upper half as input - but we'll be loading it in full.
; @param A2 Pointer to the second full sized media source register operand, where we
;           will only use the upper half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1 ; the high-half variants need the exact same wrapper code.
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3895
3896
3897;
3898; Shufflers with evil 8-bit immediates.
3899;
3900
;;
; pshufw with a run-time immediate: jumps into a table of 256 pre-generated
; 'pshufw mm0, mm1, N / ret' stubs, indexed by the immediate byte.
;
; @param A0 Pointer to the destination (output).
; @param A1 Pointer to the source operand.
; @param A2 The 8-bit immediate (0..255).
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq mm1, [A1]
        movq mm0, mm1                   ; paranoia! (fixed: was the no-op 'movq mm0, mm0'; mirrors the SSE variants)
        lea T0, [A2 + A2*4]             ; sizeof(pshufw+ret) == 5
        lea T1, [.imm0 xWrtRIP]
        lea T1, [T1 + T0]               ; T1 = &.imm0 + imm8 * 5
        call T1                         ; execute the stub for this immediate
        movq [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64
3926
3927
;;
; SSE pshufhw/pshuflw/pshufd with a run-time immediate: jumps into a table of
; 256 pre-generated '%1 xmm0, xmm1, N / ret' stubs, indexed by the immediate.
;
; @param 1  The instruction.
;
; @param A0 Pointer to the destination (output).
; @param A1 Pointer to the source operand.
; @param A2 The 8-bit immediate (0..255).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm1, [A1]
        movdqu xmm0, xmm1 ; paranoia!
        lea T1, [.imm0 xWrtRIP]
        lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: (A3 * 3) *2
        lea T1, [T1 + T0*2]             ; T1 = &.imm0 + imm8 * 6
        call T1                         ; execute the stub for this immediate
        movdqu [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1 xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3959
3960
;;
; AVX vpshufhw/vpshuflw/vpshufd with a run-time immediate: jumps into a table of
; 256 pre-generated '%1 ymm0, ymm1, N / ret' stubs, indexed by the immediate.
;
; @param 1  The instruction.
;
; @param A0 Pointer to the destination (output).
; @param A1 Pointer to the source operand.
; @param A2 The 8-bit immediate (0..255).
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE - this is an AVX worker

        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1 ; paranoia!
        lea T1, [.imm0 xWrtRIP]
        lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: (A3 * 3) *2
        lea T1, [T1 + T0*2]             ; T1 = &.imm0 + imm8 * 6
        call T1                         ; execute the stub for this immediate
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_SSE_EPILOGUE - this is an AVX worker
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1 ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3992
3993
3994;
3995; Shifts with evil 8-bit immediates.
3996;
3997
;;
; MMX shift by a run-time immediate: jumps into a table of 256 pre-generated
; '%1 mm0, N / ret' stubs, indexed by the immediate byte.
;
; @param 1  The shift instruction.
;
; @param A0 Pointer to the operand to shift (input/output).
; @param A1 The 8-bit immediate (0..255).
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq mm0, [A0]
        lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 5
        lea T1, [.imm0 xWrtRIP]
        lea T1, [T1 + T0]               ; T1 = &.imm0 + imm8 * 5
        call T1                         ; execute the stub for this immediate
        movq [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        %1 mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4033
4034
;;
; SSE shift by a run-time immediate: jumps into a table of 256 pre-generated
; '%1 xmm0, N / ret' stubs, indexed by the immediate byte.
;
; @param 1  The shift instruction.
;
; @param A0 Pointer to the operand to shift (input/output).
; @param A1 The 8-bit immediate (0..255).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm0, [A0]
        lea T1, [.imm0 xWrtRIP]
        lea T0, [A1 + A1*2] ; sizeof(psXX+ret) == 6: (A3 * 3) *2
        lea T1, [T1 + T0*2]             ; T1 = &.imm0 + imm8 * 6
        call T1                         ; execute the stub for this immediate
        movdqu [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1 xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4072
4073
4074;
4075; Move byte mask.
4076;
4077
;;
; pmovmskb on an MMX register: extracts the byte sign bits into a mask.
;
; @param A0 Pointer to the 64-bit destination for the mask.
; @param A1 Pointer to the source MMX-sized operand.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq mm1, [A1]
        pmovmskb T0, mm1                ; mask of the 8 byte sign bits
        mov [A0], T0
%ifdef RT_ARCH_X86
        mov dword [A0 + 4], 0           ; T0 is only 32-bit on x86; zero the upper dword explicitly
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4091
;;
; pmovmskb on an XMM register: extracts the byte sign bits into a mask.
;
; @param A0 Pointer to the 64-bit destination for the mask.
; @param A1 Pointer to the source XMM-sized operand.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm1, [A1]
        pmovmskb T0, xmm1               ; mask of the 16 byte sign bits
        mov [A0], T0
%ifdef RT_ARCH_X86
        mov dword [A0 + 4], 0           ; T0 is only 32-bit on x86; zero the upper dword explicitly
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4105
;;
; vpmovmskb on a YMM register: extracts the byte sign bits into a mask.
;
; @param A0 Pointer to the 64-bit destination for the mask.
; @param A1 Pointer to the source YMM-sized operand.
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1              ; mask of the 32 byte sign bits
        mov [A0], T0
%ifdef RT_ARCH_X86
        mov dword [A0 + 4], 0           ; T0 is only 32-bit on x86; zero the upper dword explicitly
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4119
4120
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; @param 1 The instruction
;
; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).   (currently unused by the body)
; @param A1 Pointer to the destination media register size operand (output).
; @param A2 Pointer to the first source media register size operand (input).
; @param A3 Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]              ; first source
        vmovdqu xmm1, [A3]              ; second source
        %1 xmm0, xmm0, xmm1             ; three-operand AVX form
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1 ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4192
4193
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source
        vmovdqu xmm1, [A2]              ; second source
        %1 xmm0, xmm0, xmm1             ; three-operand AVX form
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1 ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4260
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param 1 The instruction
; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1 xmm0, xmm0                   ; two-operand (unary) AVX form
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1 ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4303
4304
;
; The SSE 4.2 crc32
;
; @param A0 Pointer to the 32-bit destination (CRC accumulator, input/output).
; @param A1 The source operand, sized according to the suffix.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov T0_32, [A0]                 ; load the current CRC accumulator
        crc32 T0_32, A1_8               ; fold in one byte
        mov [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8

BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov T0_32, [A0]
        crc32 T0_32, A1_16              ; fold in one word
        mov [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16

BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov T0_32, [A0]
        crc32 T0_32, A1_32              ; fold in one dword
        mov [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov T0_32, [A0]                 ; zero-extends into T0
        crc32 T0, A1                    ; fold in one qword (64-bit form; result stays 32-bit)
        mov [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4352
4353
;
; PTEST (SSE 4.1)
;
; @param A0 Pointer to the first source operand (aka readonly destination).
; @param A1 Pointer to the second source operand.
; @param A2 Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm0, [A0]
        movdqu xmm1, [A1]
        ptest xmm0, xmm1                ; sets ZF/CF from AND / ANDN results
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; store the status flags into the guest EFLAGS

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4373
;;
; VPTEST on YMM registers (AVX).
;
; @param A0 Pointer to the first source operand (aka readonly destination).
; @param A1 Pointer to the second source operand.
; @param A2 Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE - this is an AVX worker

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest ymm0, ymm1               ; sets ZF/CF from AND / ANDN results
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; store the status flags into the guest EFLAGS

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_SSE_EPILOGUE - this is an AVX worker
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4386
4387
;;
; Template for the [v]pmov{s,z}x* instructions
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 The source operand value (input).
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd xmm0, A1
        %1 xmm0, xmm0
        vmovdqu [A0], xmm0              ; NOTE(review): AVX-encoded store in the SSE variant - verify intent

        IEMIMPL_SSE_EPILOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (copy/paste)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu xmm0, [A1]               ; here A1 is a pointer to the 128-bit source
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4433
4434IEMIMPL_V_PMOV_SZ_X pmovsxbw
4435IEMIMPL_V_PMOV_SZ_X pmovsxbd
4436IEMIMPL_V_PMOV_SZ_X pmovsxbq
4437IEMIMPL_V_PMOV_SZ_X pmovsxwd
4438IEMIMPL_V_PMOV_SZ_X pmovsxwq
4439IEMIMPL_V_PMOV_SZ_X pmovsxdq
4440
4441IEMIMPL_V_PMOV_SZ_X pmovzxbw
4442IEMIMPL_V_PMOV_SZ_X pmovzxbd
4443IEMIMPL_V_PMOV_SZ_X pmovzxbq
4444IEMIMPL_V_PMOV_SZ_X pmovzxwd
4445IEMIMPL_V_PMOV_SZ_X pmovzxwq
4446IEMIMPL_V_PMOV_SZ_X pmovzxdq
4447
4448
;;
; Need to move this as well somewhere better?
;
; SSE worker result package: the 128-bit value plus the resulting MXCSR.
struc IEMSSERESULT
        .uResult resd 4
        .MXCSR resd 1
endstruc


;;
; Need to move this as well somewhere better?
;
; AVX 128-bit worker result package: the 128-bit value plus the resulting MXCSR.
struc IEMAVX128RESULT
        .uResult resd 4
        .MXCSR resd 1
endstruc


;;
; Need to move this as well somewhere better?
;
; AVX 256-bit worker result package: the 256-bit value plus the resulting MXCSR.
struc IEMAVX256RESULT
        .uResult resd 8
        .MXCSR resd 1
endstruc
4474
4475
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; @uses 4 bytes of stack to save the original value, T0.
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR on the stack (popped by SSE_ST_FXSTATE_MXCSR)
        mov T0_32, [%1 + X86FXSTATE.MXCSR]
        and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only rounding/FZ/DAZ from the guest
        or T0_32, X86_MXCSR_XCPT_MASK   ; mask all exceptions so the host never takes a #XF
        sub xSP, 4
        mov [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the combined value
        add xSP, 4
%endmacro
4495
4496
;;
; Restores the SSE MXCSR register with the original value.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value.
; @param 2 Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer (pops the slot pushed by SSE_LD_FXSTATE_MXCSR).
;
%macro SSE_ST_FXSTATE_MXCSR 2
        sub xSP, 4
        stmxcsr [xSP]                   ; read the post-instruction MXCSR
        mov T0_32, [xSP]
        add xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov T1_32, [%2 + X86FXSTATE.MXCSR]
        and T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the newly raised exception flags
        or T0_32, T1_32
        mov [%1 + IEMSSERESULT.MXCSR], T0_32

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by SSE_LD_FXSTATE_MXCSR
        add xSP, 4
%endmacro
4520
4521
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; @uses 4 bytes of stack to save the original value.
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
; @note Unlike SSE_LD_FXSTATE_MXCSR this does not set X86_MXCSR_XCPT_MASK;
;       assumed safe for the AVX workers using it - see the AVX*_ST_* macros.
;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR on the stack (popped by AVX*_ST_XSAVEAREA_MXCSR)
        mov T0_32, [%1 + X86FXSTATE.MXCSR]
        and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only rounding/FZ/DAZ from the guest
        sub xSP, 4
        mov [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the combined value
        add xSP, 4
%endmacro
4540
4541
;;
; Restores the AVX128 MXCSR register with the original value.
;
; @param 1 Expression giving the address where to return the MXCSR value.
;
; @note Restores the stack pointer (pops the slot pushed by AVX_LD_XSAVEAREA_MXCSR).
;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR] ; return the post-instruction MXCSR directly

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
        add xSP, 4
%endmacro
4555
4556
;;
; Restores the AVX256 MXCSR register with the original value.
;
; @param 1 Expression giving the address where to return the MXCSR value.
;
; @note Restores the stack pointer (pops the slot pushed by AVX_LD_XSAVEAREA_MXCSR).
;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR] ; return the post-instruction MXCSR directly

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
        add xSP, 4
%endmacro
4570
4571
;;
; Floating point instruction working on two full sized registers.
;
; @param 1 The instruction
; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second media register size operand (input).
;
; NOTE(review): declared with 12 arg bytes but uses PROLOGUE_4_ARGS (other 4-arg
;               workers use 16) - verify the fastcall decoration on 32-bit hosts.
;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate guest rounding/FZ/DAZ, mask exceptions

        movdqu xmm0, [A2]
        movdqu xmm1, [A3]
        %1 xmm0, xmm1
        movdqu [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; merge new exception flags + restore host MXCSR
        IEMIMPL_SSE_EPILOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (copy/paste)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1              ; two-operand (unary) AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4696
4697
4698;;
4699; Floating point instruction working on a full sized register and a single precision operand.
4700;
4701; @param 1 The instruction
4702;
4703; @param A0 FPU context (FXSTATE or XSAVEAREA).
4704; @param A1 Where to return the result including the MXCSR value.
4705; @param A2 Pointer to the first media register size operand (input/output).
4706; @param A3 Pointer to the second single precision floating point value (input).
4707;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (from the FXSTATE at A0).

        movdqu  xmm0, [A2]              ; full-width first operand (input/output).
        movd    xmm1, [A3]              ; 32-bit scalar second operand.
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; store the result MXCSR and restore the host one.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste error, cf. the SSE variant above).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
4749
4750
4751;;
4752; Floating point instruction working on a full sized register and a double precision operand.
4753;
4754; @param 1 The instruction
4755;
4756; @param A0 FPU context (FXSTATE or XSAVEAREA).
4757; @param A1 Where to return the result including the MXCSR value.
4758; @param A2 Pointer to the first media register size operand (input/output).
4759; @param A3 Pointer to the second double precision floating point value (input).
4760;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (from the FXSTATE at A0).

        movdqu  xmm0, [A2]              ; full-width first operand (input/output).
        movq    xmm1, [A3]              ; 64-bit scalar second operand.
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; store the result MXCSR and restore the host one.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

; AVX form: three-operand vxxx dst, src1, src2 (dst duplicated as src1).
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovq   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4801
4802
4803;;
4804; Macro for the cvtpd2ps/cvtps2pd instructions.
4805;
4806; 1 The instruction name.
4807; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
4808;
4809; @param A0 FPU context (FXSTATE or XSAVEAREA).
4810; @param A1 Where to return the result including the MXCSR value.
4811; @param A2 Pointer to the first media register size operand (input/output).
4812; @param A3 Pointer to the second media register size operand (input).
4813;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (from the FXSTATE at A0).

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; store the result MXCSR and restore the host one.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; narrowing conversion: 256-bit source, 128-bit result.
 %else
        v %+ %1 ymm0, xmm1              ; widening conversion: 128-bit source, 256-bit result.
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
4867
4868
4869;;
4870; shufps instructions with 8-bit immediates.
4871;
4872; @param A0 Pointer to the destination media register size operand (input/output).
4873; @param A1 Pointer to the first source media register size operand (input).
4874; @param A2 The 8-bit immediate
4875;
; Dispatches into a 256-entry table of 6-byte stubs, one per immediate value,
; since the immediate must be baked into the instruction encoding.
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6.
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufps  xmm0, xmm1, bImm
        ret
        int3                            ; pads each stub to exactly 6 bytes.
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufps_u128
4902
4903
4904;;
4905; shufpd instruction with 8-bit immediates.
4906;
4907; @param A0 Pointer to the destination media register size operand (input/output).
4908; @param A1 Pointer to the first source media register size operand (input).
4909; @param A2 The 8-bit immediate
4910;
; Same per-immediate stub-table scheme as shufps above; shufpd's 66h prefix
; makes it 5 bytes, so no int3 padding is needed to hit the 6-byte stride.
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6.
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufpd  xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufpd_u128
4936
4937
4938;;
4939; vshufp{s,d} instructions with 8-bit immediates.
4940;
4941; @param 1 The instruction name.
4942;
4943; @param A0 Pointer to the destination media register size operand (output).
4944; @param A1 Pointer to the first source media register size operand (input).
4945; @param A2 Pointer to the second source media register size operand (input).
4946; @param A3 The 8-bit immediate
4947;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6.
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6.
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5004
5005
5006;;
5007; One of the [p]blendv{b,ps,pd} variants
5008;
5009; @param 1 The instruction
5010;
5011; @param A0 Pointer to the first media register sized operand (input/output).
5012; @param A1 Pointer to the second media sized value (input).
5013; @param A2 Pointer to the media register sized mask value (input).
5014;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; xmm0 is the implicit mask operand of [p]blendv*.
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; Fixed: was IEMIMPL_SSE_PROLOGUE (copy & paste error).
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5034
5035
5036;;
5037; One of the v[p]blendv{b,ps,pd} variants
5038;
5039; @param 1 The instruction
5040;
5041; @param A0 Pointer to the first media register sized operand (output).
5042; @param A1 Pointer to the first media register sized operand (input).
5043; @param A2 Pointer to the second media register sized operand (input).
5044; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source.
        vmovdqu xmm1, [A2]              ; second source.
        vmovdqu xmm2, [A3]              ; mask (explicit 4th operand in the VEX encoding).
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste error).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste error).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5078
5079
5080;;
5081; palignr mm1, mm2/m64 instruction.
5082;
5083; @param A0 Pointer to the first media register sized operand (output).
5084; @param A1 The second register sized operand (input).
5085; @param A2 The 8-bit immediate.
; Per-immediate stub-table dispatch (6-byte stride), MMX register flavour.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, A1                 ; A1 holds the 64-bit value directly (not a pointer).
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6.
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        palignr mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_palignr_u64
5111
5112
5113;;
5114; SSE instructions with 8-bit immediates of the form
5115; xxx xmm1, xmm2, imm8.
5116; where the instruction encoding takes up 6 bytes.
5117;
5118; @param 1 The instruction name.
5119;
5120; @param A0 Pointer to the first media register size operand (input/output).
5121; @param A1 Pointer to the second source media register size operand (input).
5122; @param A2 The 8-bit immediate
5123;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A2 + A2*3]         ; sizeof(insnX+ret) == 8: (A2 * 4) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 8.
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; pads each stub to exactly 8 bytes.
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5160
5161
5162;;
5163; AVX instructions with 8-bit immediates of the form
5164; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5165; where the instruction encoding takes up 6 bytes.
5166;
5167; @param 1 The instruction name.
5168; @param 2 Whether the instruction has a 256-bit variant (1) or not (0).
5169;
5170; @param A0 Pointer to the destination media register size operand (output).
5171; @param A1 Pointer to the first source media register size operand (input).
5172; @param A2 Pointer to the second source media register size operand (input).
5173; @param A3 The 8-bit immediate
5174;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 8.
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3                            ; pads each stub to exactly 8 bytes.
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 8.
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3                            ; pads each stub to exactly 8 bytes.
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5238
5239
5240;;
5241; Need to move this as well somewhere better?
5242;
; Source operand package for the pcmpistr{i,m} helpers: two 128-bit values.
struc IEMPCMPISTRXSRC
    .uSrc1 resd 4                       ; first 128-bit source operand.
    .uSrc2 resd 4                       ; second 128-bit source operand.
endstruc

; Source operand package for the pcmpestr{i,m} helpers: two 128-bit values
; plus the explicit RAX/RDX length registers those instructions consume.
struc IEMPCMPESTRXSRC
    .uSrc1 resd 4                       ; first 128-bit source operand.
    .uSrc2 resd 4                       ; second 128-bit source operand.
    .u64Rax resd 2                      ; guest RAX (length of first operand).
    .u64Rdx resd 2                      ; guest RDX (length of second operand).
endstruc
5254
5255;;
5256; The pcmpistri instruction.
5257;
5258; @param A0 Pointer to the ECX register to store the result to (output).
5259; @param A1 Pointer to the EFLAGS register.
5260; @param A2 Pointer to the structure containing the source operands (input).
5261; @param A3 The 8-bit immediate
5262;
; Per-immediate stub-table dispatch (8-byte stride); result index lands in ecx.
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 8.
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; pcmpistri leaves its result index in ecx.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistri xmm0, xmm1, bImm
        ret
        int3                            ; pads each stub to exactly 8 bytes.
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistri_u128
5292
5293;;
5294; The pcmpestri instruction.
5295;
5296; @param A0 Pointer to the ECX register to store the result to (output).
5297; @param A1 Pointer to the EFLAGS register.
5298; @param A2 Pointer to the structure containing the source operands (input).
5299; @param A3 The 8-bit immediate
5300;
; Per-immediate stub-table dispatch (8-byte stride); pcmpestri additionally
; consumes explicit lengths in RAX/RDX, loaded from the source structure.
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 8.
        push    xDX ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; pcmpestri leaves its result index in ecx.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        ; NOTE(review): the 0x48 byte is emitted *before* the mandatory 66h prefix
        ; NASM generates for pcmpestri (byte order 48 66 0F 3A ...); the SDM requires
        ; REX to immediately precede the opcode, otherwise it is ignored -- verify
        ; the intended full RAX/RDX semantics actually take effect.
        db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpestri_u128
5334
5335;;
5336; The pcmpistrm instruction template.
5337;
5338; @param A0 Pointer to the XMM0 register to store the result to (output).
5339; @param A1 Pointer to the EFLAGS register.
5340; @param A2 Pointer to the structure containing the source operands (input).
5341; @param A3 The 8-bit immediate
5342;
; Per-immediate stub-table dispatch (8-byte stride); sources go in xmm1/xmm2
; because pcmpistrm writes its mask result to the (implicit) xmm0 register.
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 8.
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0              ; store the implicit xmm0 mask result.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3                            ; pads each stub to exactly 8 bytes.
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistrm_u128
5371
5372;;
5373; The pcmpestrm instruction template.
5374;
5375; @param A0 Pointer to the XMM0 register to store the result to (output).
5376; @param A1 Pointer to the EFLAGS register.
5377; @param A2 Pointer to the structure containing the source operands (input).
5378; @param A3 The 8-bit immediate
5379;
; Per-immediate stub-table dispatch (8-byte stride); sources in xmm1/xmm2
; (pcmpestrm writes the implicit xmm0), lengths loaded into RAX/RDX.
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 8.
        push    xDX ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0              ; store the implicit xmm0 mask result.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        ; NOTE(review): the 0x48 byte is emitted *before* the mandatory 66h prefix
        ; NASM generates for pcmpestrm (byte order 48 66 0F 3A ...); the SDM requires
        ; REX to immediately precede the opcode, otherwise it is ignored -- verify
        ; the intended full RAX/RDX semantics actually take effect.
        db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpestrm_u128
5412
5413
5414;;
5415; pinsrw instruction.
5416;
5417; @param A0 Pointer to the first media register size operand (input/output).
5418; @param A1 The 16 bit input operand (input).
5419; @param A2 The 8-bit immediate
5420;
; Per-immediate stub-table dispatch, 5-byte stride (pinsrw mm,r32,imm8 is 4 bytes + ret).
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movq    mm0, [A0]
        lea     T0, [A2 + A2*4]         ; sizeof(pinsrw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T1, [T1 + T0]           ; T1 = .imm0 + imm8 * 5.
        call    T1
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  mm0, A1_32, bImm        ; A1_32 = 32-bit view of the argument register holding the word.
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u64
5445
; XMM flavour: 6-byte stride (the 66h prefix adds one byte over the MMX form).
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6.
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u128
5470
5471;;
5472; vpinsrw instruction.
5473;
5474; @param A0 Pointer to the first media register size operand (output).
5475; @param A1 Pointer to the source media register size operand (input).
5476; @param A2 The 16 bit input operand (input).
5477; @param A3 The 8-bit immediate
5478;
; VEX flavour, 6-byte stride. NOTE(review): uses the SSE prologue/epilogue for
; a VEX-encoded instruction -- presumably equivalent here, but confirm against
; the other v* helpers which use IEMIMPL_AVX_PROLOGUE/EPILOGUE.
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6.
        mov     A1, A2 ; A2 requires longer encoding on Windows
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpinsrw_u128
5504
5505
5506;;
5507; pextrw instruction.
5508;
5509; @param A0 Pointer to the 16bit output operand (output).
5510; @param A1 Pointer to the media register size operand (input).
5511; @param A2 The 8-bit immediate
5512;
; Per-immediate stub-table dispatch, 5-byte stride; extracted word comes back in T0.
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movq    mm0, A1                 ; A1 holds the 64-bit value directly (not a pointer).
        lea     T0, [A2 + A2*4]         ; sizeof(pextrw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T1, [T1 + T0]           ; T1 = .imm0 + imm8 * 5.
        call    T1
        mov     word [A0], T0_16        ; store the extracted word.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, mm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u64
5537
; XMM flavour: 6-byte stride (66h prefix adds one byte over the MMX form).
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A2 + A2*2]         ; sizeof(pextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6.
        call    T1
        mov     word [A0], T0_16        ; store the extracted word.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u128
5562
5563;;
5564; vpextrw instruction.
5565;
5566; @param A0 Pointer to the 16bit output operand (output).
5567; @param A1 Pointer to the source media register size operand (input).
5568; @param A2 The 8-bit immediate
5569;
; VEX flavour, 6-byte stride. NOTE(review): SSE prologue/epilogue used for a
; VEX-encoded instruction -- presumably fine, but differs from the other v* helpers.
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the per-immediate stub table.
        lea     T0, [A2 + A2*2]         ; sizeof(vpextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6.
        call    T1
        mov     word [A0], T0_16        ; store the extracted word.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpextrw T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpextrw_u128
5594
5595
5596;;
5597; movmskp{s,d} SSE instruction template
5598;
5599; @param 1 The SSE instruction name.
5600; @param 2 The AVX instruction name.
5601;
5602; @param A0 Pointer to the output register (output/byte sized).
5603; @param A1 Pointer to the source media register size operand (input).
5604;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
; SSE variant: extract the sign-bit mask into T0 and store the low byte.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8         ; at most 4 mask bits for a 128-bit ps/pd source.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; AVX 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

; AVX 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8         ; at most 8 mask bits for a 256-bit ps source.

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5645
5646
5647;;
5648; Restores the SSE MXCSR register with the original value.
5649;
5650; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
5651; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
5652; @param 2 Expression giving the address of the FXSTATE of the guest.
5653;
5654; @note Restores the stack pointer.
5655;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; capture the MXCSR as left by the executed instruction.
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the exception status flags ...
        or      T0_32, T1_32            ; ... and combine them with the guest's control bits.
        mov     [%1], T0_32             ; return the merged MXCSR value.

        ; NOTE(review): this reloads and pops a 4-byte slot this macro never pushed --
        ; presumably the host MXCSR saved on the stack by SSE_LD_FXSTATE_MXCSR (not
        ; visible here); confirm the two macros are always used as a pair.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5670
5671
5672;;
5673; cvttsd2si instruction - 32-bit variant.
5674;
5675; @param A0 FPU context (FXSTATE or XSAVEAREA).
5676; @param A1 Where to return the MXCSR value.
5677; @param A2 Pointer to the result operand (output).
5678; @param A3 Pointer to the second operand (input).
5679;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (from the FXSTATE at A0).

        cvttsd2si T0_32, [A3]           ; truncating double -> i32 conversion.
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
5692
5693;;
5694; cvttsd2si instruction - 64-bit variant.
5695;
5696; @param A0 FPU context (FXSTATE or XSAVEAREA).
5697; @param A1 Where to return the MXCSR value.
5698; @param A2 Pointer to the result operand (output).
5699; @param A3 Pointer to the second operand (input).
5700;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (from the FXSTATE at A0).

        cvttsd2si T0, [A3]              ; truncating double -> i64 conversion.
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5713
5714
5715;;
5716; cvtsd2si instruction - 32-bit variant.
5717;
5718; @param A0 FPU context (FXSTATE or XSAVEAREA).
5719; @param A1 Where to return the MXCSR value.
5720; @param A2 Pointer to the result operand (output).
5721; @param A3 Pointer to the second operand (input).
5722;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to the guest MXCSR (from the FXSTATE at A0).

        cvtsd2si T0_32, [A3]            ; rounding double -> i32 conversion (per MXCSR.RC).
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
5735
5736;;
5737; cvtsd2si instruction - 64-bit variant.
5738;
5739; @param A0 FPU context (FXSTATE or XSAVEAREA).
5740; @param A1 Where to return the MXCSR value.
5741; @param A2 Pointer to the result operand (output).
5742; @param A3 Pointer to the second operand (input).
5743;
5744BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
5745 PROLOGUE_4_ARGS
5746 IEMIMPL_SSE_PROLOGUE
5747 SSE_LD_FXSTATE_MXCSR A0
5748
5749 cvtsd2si T0, [A3]
5750 mov qword [A2], T0
5751
5752 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5753 IEMIMPL_SSE_EPILOGUE
5754 EPILOGUE_4_ARGS
5755ENDPROC iemAImpl_cvtsd2si_i64_r64
5756
5757
;;
; cvttss2si instruction - 32-bit variant.
;
; Converts a single precision floating point value into a signed 32-bit
; integer using truncation (round towards zero), independent of the MXCSR
; rounding mode.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the state at A0.

        cvttss2si T0_32, [A3]           ; T0_32 = (int32_t)*(float *)A3, truncated.
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR and restore the host one.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32

;;
; cvttss2si instruction - 64-bit variant.
;
; Same as the 32-bit variant above, but producing a signed 64-bit integer.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0, [A3]              ; T0 = (int64_t)*(float *)A3, truncated.
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
5799
5800
;;
; cvtss2si instruction - 32-bit variant.
;
; Converts a single precision floating point value into a signed 32-bit
; integer, rounding according to the rounding mode loaded from the guest
; MXCSR (unlike cvttss2si, which always truncates).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR so its rounding mode governs the conversion.

        cvtss2si T0_32, [A3]            ; T0_32 = (int32_t)*(float *)A3, rounded per MXCSR.RC.
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR and restore the host one.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32

;;
; cvtss2si instruction - 64-bit variant.
;
; Same as the 32-bit variant above, but producing a signed 64-bit integer.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0, [A3]               ; T0 = (int64_t)*(float *)A3, rounded per MXCSR.RC.
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
5842
5843
;;
; cvtsi2ss instruction - 32-bit variant.
;
; Converts a signed 32-bit integer into a single precision floating point
; value, rounding per the guest MXCSR rounding mode.  Only the low 32 bits
; of xmm0 are produced and stored; the upper bits are never written out.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the state at A0.

        cvtsi2ss xmm0, dword [A3]       ; xmm0[31:0] = (float)*(int32_t *)A3.
        movd    dword [A2], xmm0        ; store only the low 32-bit result.

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR and restore the host one.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32

;;
; cvtsi2ss instruction - 64-bit variant.
;
; Same as above, but converting a signed 64-bit integer input.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, qword [A3]       ; xmm0[31:0] = (float)*(int64_t *)A3.
        movd    dword [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
5885
5886
;;
; cvtsi2sd instruction - 32-bit variant.
;
; Converts a signed 32-bit integer into a double precision floating point
; value.  Only the low 64 bits of xmm0 are produced and stored.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the state at A0.

        cvtsi2sd xmm0, dword [A3]       ; xmm0[63:0] = (double)*(int32_t *)A3 (exact, no rounding needed).
        movq    [A2], xmm0              ; store only the low 64-bit result.

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR and restore the host one.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32

;;
; cvtsi2sd instruction - 64-bit variant.
;
; Same as above, but converting a signed 64-bit integer input (may round
; per MXCSR.RC since not every int64 is exactly representable as double).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, qword [A3]       ; xmm0[63:0] = (double)*(int64_t *)A3.
        movq    [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
5928
5929
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Only the rounding mode (RC), flush-to-zero (FZ) and denormals-are-zero
; (DAZ) control bits are taken over from the guest; all exception types are
; masked so the guest can never raise a #XM on the host while we emulate.
;
; The original host MXCSR is saved in a 4-byte stack slot that stays
; allocated until SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE reloads and frees it.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save host MXCSR; slot deliberately left allocated for the ST macro.
        mov     T0_32, [%1]             ; fetch the guest MXCSR.
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; keep only the control bits we mirror.
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all SIMD FP exceptions.
        sub     xSP, 4                  ; scratch slot for ldmxcsr (memory operand only).
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4                  ; free only the scratch slot; the saved host value remains.
%endmacro
5949
5950
;;
; Restores the SSE MXCSR register with the original value.
;
; Merges the SIMD FP exception status flags raised by the emulated
; instruction into the guest MXCSR value at %1, then reloads the host MXCSR
; that SSE_LD_FXSTATE_MXCSR_ONLY left saved on the stack.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        sub     xSP, 4
        stmxcsr [xSP]                   ; read the post-instruction MXCSR (status flags of interest).
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%1]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the exception status flags.
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by SSE_LD_FXSTATE_MXCSR_ONLY...
        add     xSP, 4                  ; ... and free its stack slot, rebalancing xSP.
%endmacro
5973
5974
;
; UCOMISS (SSE)
;
; Unordered compare of the low single precision values, reporting the
; result in the EFLAGS status bits (ZF/PF/CF per the SDM convention).
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; guest rounding/FZ/DAZ, all exceptions masked.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the comparison result out to *A1.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags back and restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128

; VEX encoded variant (AVX) - same operation, host must support AVX.
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
6012
6013
;
; UCOMISD (SSE)
;
; Unordered compare of the low double precision values, reporting the
; result in the EFLAGS status bits (ZF/PF/CF per the SDM convention).
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; guest rounding/FZ/DAZ, all exceptions masked.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the comparison result out to *A1.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags back and restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128

; VEX encoded variant (AVX) - same operation, host must support AVX.
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6051
;
; COMISS (SSE)
;
; Ordered compare of the low single precision values (unlike UCOMISS it
; signals invalid on QNaN operands too), reporting the result in the
; EFLAGS status bits.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; guest rounding/FZ/DAZ, all exceptions masked.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comiss  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the comparison result out to *A1.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags back and restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128

; VEX encoded variant (AVX) - same operation, host must support AVX.
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6089
6090
;
; COMISD (SSE)
;
; Ordered compare of the low double precision values (unlike UCOMISD it
; signals invalid on QNaN operands too), reporting the result in the
; EFLAGS status bits.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; guest rounding/FZ/DAZ, all exceptions masked.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comisd  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the comparison result out to *A1.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge status flags back and restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128

; VEX encoded variant (AVX) - same operation, host must support AVX.
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6128
6129
;;
; Two 128-bit media register sized source operands packed into one structure,
; so a single argument register can carry both inputs to the helpers below.
;
; Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1 resd 4                       ; first 128-bit source operand.
    .uSrc2 resd 4                       ; second 128-bit source operand.
endstruc
6137
6138
;
; CMPPS (SSE)
;
; The imm8 predicate must be encoded into the instruction, so the runtime
; value in A3 is handled by calling into a table of 256 pre-generated
; 'cmpps xmm0, xmm1, <imm>; ret' stubs of 5 bytes each.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T0, [A3 + A3*4]         ; T0 = A3 * 5; sizeof(cmpps+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]           ; T1 = address of the stub for immediate A3.
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        cmpps   xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_cmpps_u128
6174
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; The runtime imm8 is dispatched via a table of 256 'insn; ret' stubs of
; 6 bytes each (5-byte instruction + 1-byte ret).
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(insn+ret) == 6: (A3 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6221
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; The runtime imm8 is dispatched via a table of 256 'insn; ret' stubs of
; 7 bytes each (6-byte instruction + 1-byte ret).
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3*2 + A3]         ; sizeof(insn+ret) == 7: 2 * (A3 * 3) + A3
        lea     T0, [T0*2]
        lea     T0, [T0 + A3]
        lea     T1, [T1 + T0]
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*(6+1) == 0x700
dw 0xf8ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x106ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6273
6274
;;
; SSE instructions of the form
;    xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; Used for the packed double -> packed MMX integer conversions
; (cvtpd2pi / cvttpd2pi).
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        %1      mm0, xmm0               ; convert the 128-bit input into a 64-bit MMX result.
        movq    [A1], mm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6304
;;
; SSE instructions of the form
;    xxx xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Used for the packed MMX integer -> packed float conversions
; (cvtpi2ps / cvtpi2pd); the full 128-bit destination is read and
; written back since the instructions only modify part of it.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A1]              ; load the current destination value...
        movq    mm0, A2
        %1      xmm0, mm0
        movdqu  [A1], xmm0              ; ... and store the partially updated register back.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6335
;;
; SSE instructions of the form
;    xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Used for the packed single -> packed MMX integer conversions
; (cvtps2pi / cvttps2pi); only the low 64 bits of the source are needed,
; so the value is passed by register rather than by pointer.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    xmm0, A2
        %1      mm0, xmm0
        movq    [A1], mm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6365
;
; All forms of RDRAND and RDSEED
;
; @param    1       The instruction mnemonic (rdrand or rdseed).
; @param    2       The register receiving the random value (ax/eax/rax,
;                   must match the width given in %3).
; @param    3       The operand width in bits (16, 32 or 64), used only for
;                   composing the function name.
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2                      ; CF=1 on success, CF=0 when no random value was available.
        mov     [A0], %2
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; pass CF (and the zeroed other status bits) back to the caller.

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6390
6391
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The imm8 round selector must be encoded into the instruction, so the
; runtime value in A2 is handled by calling into a table of 256
; pre-generated 'sha1rnds4 xmm0, xmm1, <imm>; ret' stubs of 6 bytes each.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(insn+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1                      ; executes the stub for immediate A2.
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        sha1rnds4 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_sha1rnds4_u128
6426
6427
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction takes XMM0 as an implicit third operand, so the constants
; pointed to by A2 are loaded into xmm0 before performing the two rounds.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; implicit operand: current wk0|wk1 message words.
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]
        sha256rnds2 xmm1, xmm2          ; xmm1 = two SHA-256 rounds of (xmm1, xmm2, xmm0).
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
6450
6451
;
; 32-bit forms of ADCX and ADOX
;
; @param    1       The instruction name (adcx or adox).
; @param    2       The single EFLAGS bit the instruction consumes and
;                   produces (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A1, %2, 0        ; pre-load the carry-in flag from the guest EFLAGS.
        %1      A2_32, [A0]             ; A2_32 += *A0 + flag.
        mov     [A0], A2_32
        IEM_SAVE_FLAGS A1, %2, 0        ; only %2 is modified by ADCX/ADOX.

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro

;
; 64-bit forms of ADCX and ADOX
;
; @param    1       The instruction name (adcx or adox).
; @param    2       The single EFLAGS bit the instruction consumes and
;                   produces (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A1, %2, 0        ; pre-load the carry-in flag from the guest EFLAGS.
        %1      A2, [A0]                ; A2 += *A0 + flag.
        mov     [A0], A2
        IEM_SAVE_FLAGS A1, %2, 0        ; only %2 is modified by ADCX/ADOX.

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette