VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 99681

Last change on this file since 99681 was 98921, checked in by vboxsync, 22 months ago

VMM/IEM: Started implementing the dpps/dppd instructions, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 185.0 KB
Line 
1; $Id: IEMAllAImpl.asm 98921 2023-03-12 16:54:45Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; Emits the return instruction for a fastcall function.
;
; Only 32-bit x86 Windows builds emit 'ret %1'; every other configuration
; emits a plain 'ret' and ignores the parameter.
;
; @param        1       The operand handed to 'ret' on 32-bit Windows.
;
%macro RET_FASTCALL 1
 %ifdef RT_ARCH_X86
  %ifndef RT_OS_WINDOWS
        ret
  %else
        ret     %1
  %endif
 %else
        ret
 %endif
%endmacro
55
;;
; Symbol name of a fastcall function.
;
; The default definition is just NAME(a_Name); the size and prefix arguments
; are ignored.  On 32-bit x86 Windows it is replaced by a definition that
; pastes together a_Prefix, a_Name, '@' and a_cbArgs.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix)   NAME(a_Name)
%ifdef RT_OS_WINDOWS
 %ifdef RT_ARCH_X86
  %undef  NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
70
;;
; Opens a fastcall function: emits the export/global directives that apply to
; the current object format and then the entry label itself.
;
; @param        1       The function name (C).
; @param        2       The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
        export  %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef ASM_FORMAT_OMF
  %ifdef __NASM__
        export  NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global  NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
91
92
93;
94; We employ some macro assembly here to hide the calling convention differences.
95;
96%ifdef RT_ARCH_AMD64
97 %macro PROLOGUE_1_ARGS 0
98 %endmacro
99 %macro EPILOGUE_1_ARGS 0
100 ret
101 %endmacro
102 %macro EPILOGUE_1_ARGS_EX 0
103 ret
104 %endmacro
105
106 %macro PROLOGUE_2_ARGS 0
107 %endmacro
108 %macro EPILOGUE_2_ARGS 0
109 ret
110 %endmacro
111 %macro EPILOGUE_2_ARGS_EX 1
112 ret
113 %endmacro
114
115 %macro PROLOGUE_3_ARGS 0
116 %endmacro
117 %macro EPILOGUE_3_ARGS 0
118 ret
119 %endmacro
120 %macro EPILOGUE_3_ARGS_EX 1
121 ret
122 %endmacro
123
124 %macro PROLOGUE_4_ARGS 0
125 %endmacro
126 %macro EPILOGUE_4_ARGS 0
127 ret
128 %endmacro
129 %macro EPILOGUE_4_ARGS_EX 1
130 ret
131 %endmacro
132
133 %ifdef ASM_CALL64_GCC
134 %define A0 rdi
135 %define A0_32 edi
136 %define A0_16 di
137 %define A0_8 dil
138
139 %define A1 rsi
140 %define A1_32 esi
141 %define A1_16 si
142 %define A1_8 sil
143
144 %define A2 rdx
145 %define A2_32 edx
146 %define A2_16 dx
147 %define A2_8 dl
148
149 %define A3 rcx
150 %define A3_32 ecx
151 %define A3_16 cx
152 %endif
153
154 %ifdef ASM_CALL64_MSC
155 %define A0 rcx
156 %define A0_32 ecx
157 %define A0_16 cx
158 %define A0_8 cl
159
160 %define A1 rdx
161 %define A1_32 edx
162 %define A1_16 dx
163 %define A1_8 dl
164
165 %define A2 r8
166 %define A2_32 r8d
167 %define A2_16 r8w
168 %define A2_8 r8b
169
170 %define A3 r9
171 %define A3_32 r9d
172 %define A3_16 r9w
173 %endif
174
175 %define T0 rax
176 %define T0_32 eax
177 %define T0_16 ax
178 %define T0_8 al
179
180 %define T1 r11
181 %define T1_32 r11d
182 %define T1_16 r11w
183 %define T1_8 r11b
184
185 %define T2 r10 ; only AMD64
186 %define T2_32 r10d
187 %define T2_16 r10w
188 %define T2_8 r10b
189
190%else
191 ; x86
192 %macro PROLOGUE_1_ARGS 0
193 push edi
194 %endmacro
195 %macro EPILOGUE_1_ARGS 0
196 pop edi
197 ret 0
198 %endmacro
199 %macro EPILOGUE_1_ARGS_EX 1
200 pop edi
201 ret %1
202 %endmacro
203
204 %macro PROLOGUE_2_ARGS 0
205 push edi
206 %endmacro
207 %macro EPILOGUE_2_ARGS 0
208 pop edi
209 ret 0
210 %endmacro
211 %macro EPILOGUE_2_ARGS_EX 1
212 pop edi
213 ret %1
214 %endmacro
215
216 %macro PROLOGUE_3_ARGS 0
217 push ebx
218 mov ebx, [esp + 4 + 4]
219 push edi
220 %endmacro
221 %macro EPILOGUE_3_ARGS_EX 1
222 %if (%1) < 4
223 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
224 %endif
225 pop edi
226 pop ebx
227 ret %1
228 %endmacro
229 %macro EPILOGUE_3_ARGS 0
230 EPILOGUE_3_ARGS_EX 4
231 %endmacro
232
233 %macro PROLOGUE_4_ARGS 0
234 push ebx
235 push edi
236 push esi
237 mov ebx, [esp + 12 + 4 + 0]
238 mov esi, [esp + 12 + 4 + 4]
239 %endmacro
240 %macro EPILOGUE_4_ARGS_EX 1
241 %if (%1) < 8
242 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
243 %endif
244 pop esi
245 pop edi
246 pop ebx
247 ret %1
248 %endmacro
249 %macro EPILOGUE_4_ARGS 0
250 EPILOGUE_4_ARGS_EX 8
251 %endmacro
252
253 %define A0 ecx
254 %define A0_32 ecx
255 %define A0_16 cx
256 %define A0_8 cl
257
258 %define A1 edx
259 %define A1_32 edx
260 %define A1_16 dx
261 %define A1_8 dl
262
263 %define A2 ebx
264 %define A2_32 ebx
265 %define A2_16 bx
266 %define A2_8 bl
267
268 %define A3 esi
269 %define A3_32 esi
270 %define A3_16 si
271
272 %define T0 eax
273 %define T0_32 eax
274 %define T0_16 ax
275 %define T0_8 al
276
277 %define T1 edi
278 %define T1_32 edi
279 %define T1_16 di
280%endif
281
282
;;
; Loads the modified (%2) and undefined (%3) flags from [%1] into the CPU
; EFLAGS, leaving all other CPU flag bits as they are.
;
; The conditional on %3 is commented out, so the load is always performed.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
        ;%if (%3) != 0
        mov     T0_32, [%1]                 ; T0 = guest flags ('mov' does not touch EFLAGS).
        pushf                               ; Current CPU flags onto the stack.
        and     T0_32, (%2 | %3)            ; Guest side: keep only the modified and undefined flags.
        and     dword [xSP], ~(%2 | %3)     ; Stack side: drop exactly those flags.
        or      [xSP], T0                   ; Merge the two.
        popf                                ; Activate the mixed flags.
        ;%endif
%endmacro
302
;;
; Loads the flags selected by %2 and %3 from [%1] into the CPU EFLAGS,
; leaving all other CPU flag bits as they are.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of flags to load.
; @param        3       The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
        mov     T0_32, [%1]                 ; T0 = guest flags ('mov' does not touch EFLAGS).
        pushf                               ; Current CPU flags onto the stack.
        and     T0_32, (%2 | %3)            ; Guest side: keep only the selected flags.
        and     dword [xSP], ~(%2 | %3)     ; Stack side: drop exactly those flags.
        or      [xSP], T0                   ; Merge the two.
        popf                                ; Activate the mixed flags.
%endmacro
320
;;
; Copies the modified (%2) and undefined (%3) flags from the CPU EFLAGS into
; the flags variable at [%1]; all other bits of [%1] are kept.  Expands to
; nothing when both masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                 ; T0 = stored flags ('mov' does not touch EFLAGS).
        pushf
        pop     T1                          ; T1 = CPU flags.
        and     T1_32, (%2 | %3)            ; CPU side: only the modified and undefined flags.
        and     T0_32, ~(%2 | %3)           ; Stored side: everything else.
        or      T0_32, T1_32                ; Combine.
        mov     [%1], T0_32                 ; Write back.
 %endif
%endmacro
340
;;
; Updates the flags variable at [%1]: the %2 flags are taken from the CPU
; EFLAGS, the %3 flags are cleared and the %4 flags are set.  Expands to
; nothing when all three masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        mov     T0_32, [%1]                 ; T0 = stored flags ('mov' does not touch EFLAGS).
        pushf
        pop     T1                          ; T1 = CPU flags.
        and     T1_32, (%2)                 ; CPU side: only the modified flags.
        and     T0_32, ~(%2 | %3)           ; Stored side: drop modified and always-cleared flags.
        or      T0_32, T1_32                ; Combine.
  %if (%4) != 0
        or      T0_32, %4                   ; Add the always-set flags.
  %endif
        mov     [%1], T0_32                 ; Write back.
 %endif
%endmacro
364
;;
; Calculates the new EFLAGS value for [%1] from the CPU EFLAGS (flags in %2),
; a clear mask (%3), the sign bit of %4 (bit %5 - 1) and the parity table
; entry selected by the low byte of %6.
;
; This is used by MUL and IMUL, where the result (%4 & %6) lives in xAX which
; is also T0.  The flags are therefore assembled in T1.  The CPU EFLAGS are
; fetched via T2 on AMD64, and via T0 (saved and restored around the fetch)
; on x86.
;
; @remarks      Clobbers T1, %6, EFLAGS and the stack; also T2 on AMD64.  T0
;               itself is left intact (x86 pushes and pops it).
; @remarks      NOTE(review): the parity step uses T1_8, for which the x86
;               register aliases in this file provide no definition - verify
;               before instantiating this on 32-bit hosts.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                          ; T2 = CPU flags.
        mov     T1_32, [%1]                 ; T1 = stored flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; Drop modified, always-cleared and the two calculated flags.
        and     T2_32, (%2)                 ; CPU side: only the modified flags.
        or      T1_32, T2_32                ; Combine.
 %else
        push    T0                          ; T0 carries the result; park it.
        pushf
        pop     T0                          ; T0 = CPU flags.
        mov     T1_32, [%1]                 ; T1 = stored flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; Drop modified, always-cleared and the two calculated flags.
        and     T0_32, (%2)                 ; CPU side: only the modified flags.
        or      T1_32, T0_32                ; Combine.
        pop     T0                          ; Result back into T0.
 %endif

        ; SF first, as %4 is likely to be the same register as %6 which is trashed below.
        bt      %4, %5 - 1
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; PF last: look up the low byte of %6 in g_afParity and OR it in.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32                 ; Write back.
%endmacro
418
;;
; Applies fixed clear (%2) and set (%3) masks to the flags variable at [%1].
; The CPU EFLAGS are not consulted.  Expands to nothing when both masks are
; zero.
;
; @remarks      Clobbers T0 (and, through the and/or instructions, the CPU EFLAGS).
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                 ; T0 = stored flags.
  %if (%2) != 0
        and     T0_32, ~(%2)                ; Drop the always-cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3                   ; Add the always-set flags.
  %endif
        mov     [%1], T0_32                 ; Write back.
 %endif
%endmacro
439
;;
; Applies fixed clear (%2) and set (%3) masks to the flags variable at [%1]
; and replaces PF with the g_afParity entry selected by the low byte of %4.
;
; @remarks      Clobbers T0, %4, EFLAGS; also T2 on AMD64.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]                 ; T0 = stored flags.
        and     T0_32, ~(%2 | X86_EFL_PF)   ; Drop PF and the always-cleared flags.
 %if (%3) != 0
        or      T0_32, %3                   ; Add the always-set flags.
 %endif
        and     %4, 0xff                    ; Table index = low byte only.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                 ; Write back.
%endmacro
464
465
466;*********************************************************************************************************************************
467;* External Symbols *
468;*********************************************************************************************************************************
469extern NAME(g_afParity)
470
471
;;
; Generates the workers for a binary operator with a memory destination.
;
; Emitted per instruction: _u8, _u16 and _u32 workers, a _u64 worker on
; AMD64 only, and - when %2 is non-zero - a '_locked' twin of each that puts
; a LOCK prefix on the instruction.
;
; Every worker takes a pointer to the destination operand in A0, the source
; operand in A1 and a pointer to the eflags in A2.  It loads the guest flags,
; executes the instruction on [A0] and stores the resulting flags.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
        ; --- plain variants ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

        ; --- LOCK prefixed variants, only when requested ---
 %if %2 != 0

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags,                                                                  undefined-flags
IEMIMPL_BIN_OP add,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,    1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,   0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
571
572
;;
; Generates the workers for a three-operand (VEX encoded) binary operator
; with separate destination and sources.
;
; Emitted per instruction: a _u32 worker and, on AMD64 only, a _u64 worker.
;
; Every worker takes a pointer to the destination in A0, the first source
; operand in A1, the second source operand in A2 and a pointer to the eflags
; in A3.  The result is computed in T0 and then stored to [A0].
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32                 ; 'mov' keeps the flags for the save below.
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0                    ; 'mov' keeps the flags for the save below.
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                  instr, modified-flags,                                        undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
613
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; Emitted per instruction: a _u32 worker and, on AMD64 only, a _u64 worker.
;
; Every worker takes a pointer to the destination in A0, the source operand
; in A1 and a pointer to the eflags in A2.
;
; These are three-argument functions, so they must use the three-argument
; prologue/epilogue pair: on 32-bit x86 exactly one argument lives on the
; stack, and the four-argument pair would read a non-existent second stack
; argument and pop 8 bytes instead of 4 on return.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        mov     T0_32, [A0]
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                    instr,  modified-flags,                                        undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
655
656
;;
; Macro for implementing a binary operator w/o flags, VEX variant with
; separate input/output.
;
; Emitted per instruction: a _u32 worker and, on AMD64 only, a _u64 worker.
; When %3 is non-zero each also gets a '_fallback' twin that performs the
; operation with the two-operand instruction %2 and the count in CL.
;
; Every worker takes a pointer to the destination in A0, the first source
; operand in A1 and the second source operand (the count for the fallbacks)
; in A2.  No eflags pointer is passed and no flags are saved.
;
; Fallback register shuffling:
;   - ASM_CALL64_GCC: CL belongs to A3 and is free, so the count is simply
;     copied into it.
;   - Otherwise A0 is xCX, so A0 and A2 are exchanged: the count ends up in
;     CL and the destination pointer in A2.  The result must then be stored
;     through A2; A0 no longer holds a pointer.
;
; @param        1       The instruction mnemonic.
; @param        2       Fallback instruction if applicable.
; @param        3       Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                      ; CL = count, A2 = destination pointer.
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                    ; Full 64-bit result.
   %else
        xchg    A2, A0                      ; CL = count, A2 = destination pointer.
        %2      A1, cl
        mov     [A2], A1                    ; Full 64-bit result, through A2 (A0 is the count now).
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                        instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx,  sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx,  shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx,  shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep,  nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext,  nop,            0
728
729
;
; RORX uses an immediate byte for the shift count, so only a fallback
; implementation built on ROR with the count in CL is provided.
;
; A0 = pointer to the destination, A1 = value to rotate, A2 = rotate count.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifndef ASM_CALL64_GCC
        xchg    A2, A0                      ; A0 is xCX: CL = count, A2 = destination pointer.
        ror     A1_32, cl
        mov     [A2], A1_32
 %else
        mov     cl, A2_8                    ; CL belongs to A3 here and is free.
        ror     A1_32, cl
        mov     [A0], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32
747
748 %ifdef RT_ARCH_AMD64
749BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
750 PROLOGUE_3_ARGS
751 %ifdef ASM_CALL64_GCC
752 mov cl, A2_8
753 ror A1, cl
754 mov [A0], A1_32
755 %else
756 xchg A2, A0
757 ror A1, cl
758 mov [A2], A1_32
759 %endif
760 mov [A0], A1
761 EPILOGUE_3_ARGS
762ENDPROC iemAImpl_rorx_u64
763 %endif ; RT_ARCH_AMD64
764
765
;
; MULX
;
; A0 = pointer to the first destination (receives the high half),
; A1 = pointer to the second destination (receives the low half),
; A2 = uSrc1 (must end up in EDX, the implicit MULX operand), A3 = uSrc2.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifndef ASM_CALL64_GCC
        ; A1 is xDX - exchange A1 and A2 so that EDX=uSrc1 and A2=second destination.
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32                 ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A2_32 is EDX already - perfect.
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32                 ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32
785
786
;
; MULX fallback using the classic one-operand MUL (EDX:EAX = EAX * operand).
;
; A0 = pointer to the first destination (receives the high half),
; A1 = pointer to the second destination (receives the low half),
; A2 = uSrc1, A3 = uSrc2.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax                   ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A1_32                       ; uSrc1 sits in A1 (EDX) after the exchange; A2 is the destination pointer.
        mov     [A2], eax                   ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
805
806%ifdef RT_ARCH_AMD64
;
; 64-bit MULX.
;
; A0 = pointer to the first destination (receives the high half),
; A1 = pointer to the second destination (receives the low half),
; A2 = uSrc1 (must end up in RDX, the implicit MULX operand), A3 = uSrc2.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifndef ASM_CALL64_GCC
        ; A1 is xDX - exchange A1 and A2 so that RDX=uSrc1 and A2=second destination.
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                    ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A2 is RDX already - perfect.
        mulx    T0, T1, A3
        mov     [A1], T1                    ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64
823
824
;
; 64-bit MULX fallback using the classic one-operand MUL (RDX:RAX = RAX * operand).
;
; A0 = pointer to the first destination (receives the high half),
; A1 = pointer to the second destination (receives the low half),
; A2 = uSrc1, A3 = uSrc2.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax                   ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A1                          ; uSrc1 sits in A1 (RDX) after the exchange; A2 is the destination pointer.
        mov     [A2], rax                   ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback
843
844%endif
845
846
;;
; Generates the workers for a bit operator with a memory destination.
;
; Emitted per instruction: _u16 and _u32 workers, a _u64 worker on AMD64
; only, and - when %2 is non-zero - a '_locked' twin of each that puts a LOCK
; prefix on the instruction.
;
; Every worker takes a pointer to the destination operand in A0, the source
; operand in A1 and a pointer to the eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
        ; --- plain variants ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

        ; --- LOCK prefixed variants, only when requested ---
 %if %2 != 0

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags, undefined-flags
IEMIMPL_BIT_OP bt,    0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc,   1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts,   1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr,   1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
923
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; For each width three workers are emitted:
;   - the plain one, which loads the guest flags, runs the instruction and
;     saves the modified and undefined flags as the host CPU produced them;
;   - '_intel', which ignores the CPU flags and derives the result purely
;     from fixed clear/set masks plus the parity table (indexed by the
;     result); on the unchanged-destination path it sets ZF and PF instead;
;   - '_amd', which saves only the %2 flags from the CPU.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leaves it as is.  The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

        ; Like the 16 and 32-bit Intel variants, this one does not load the
        ; guest flags first: the CPU flags are never read back, the stored
        ; flags are derived from fixed masks and the parity table only.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

;               instr, modified-flags,              undefined-flags,                                                    dst unchanged on ZF
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1075
1076
;;
; Generates the POPCNT workers.
;
; Emitted: _u16 and _u32 workers and, on AMD64 only, a _u64 worker.
;
; Every worker takes a pointer to the destination in A0, the source operand
; in A1 and a pointer to the eflags in A2.  The result is computed in T0 and
; then stored to [A0].
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_16, A1_16
        mov     [A0], T0_16                 ; 'mov' keeps the flags for the save below.
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32                 ; 'mov' keeps the flags for the save below.
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0, A1
        mov     [A0], T0                    ; 'mov' keeps the flags for the save below.
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;               instr,  modified-flags,                                                                  undefined-flags
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1126
1127
1128;
1129; IMUL is also a similar but yet different case (no lock, no mem dst).
1130; The rDX:rAX variant of imul is handled together with mul further down.
1131;
; Each generated function multiplies the value passed in A1 by the value that
; A0 points to, stores the (truncated) product back through A0 and saves the
; flags through the pointer in A2.
;
1132BEGINCODE
1133; @param 1 EFLAGS that are modified.
1134; @param 2 Undefined EFLAGS.
1135; @param 3 Function suffix.
1136; @param 4 EFLAGS variation: 0 for native, 1 for intel, 2 for AMD.  Only the
1137; value 1 is tested below; 0 and 2 both take the plain IEM_SAVE_FLAGS path.
; NOTE(review): an earlier revision of this comment said intel was "(ignored)"
; and that AMD sets AF and clears PF, ZF and SF.  The code does the opposite
; selection (the intel variant gets IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF, AMD
; is treated like native) - confirm which of the two is intended.
1138%macro IEMIMPL_IMUL_TWO 4
1139BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1140 PROLOGUE_3_ARGS
1141 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1142 imul A1_16, word [A0] ; A1 = A1 * *pDst (16-bit)
1143 mov [A0], A1_16 ; store the product back to the destination
1144 %if %4 != 1
1145 IEM_SAVE_FLAGS A2, %1, %2
1146 %else
; Intel variant: presumably clears AF and ZF and derives SF/PF from the 16-bit
; result in A1 - confirm against the macro definition.
1147 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1148 %endif
1149 EPILOGUE_3_ARGS
1150ENDPROC iemAImpl_imul_two_u16 %+ %3
1151
1152BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1153 PROLOGUE_3_ARGS
1154 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1155 imul A1_32, dword [A0] ; A1 = A1 * *pDst (32-bit)
1156 mov [A0], A1_32
1157 %if %4 != 1
1158 IEM_SAVE_FLAGS A2, %1, %2
1159 %else
1160 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1161 %endif
1162 EPILOGUE_3_ARGS
1163ENDPROC iemAImpl_imul_two_u32 %+ %3
1164
1165 %ifdef RT_ARCH_AMD64
1166BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1167 PROLOGUE_3_ARGS
1168 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1169 imul A1, qword [A0] ; A1 = A1 * *pDst (64-bit)
1170 mov [A0], A1
1171 %if %4 != 1
1172 IEM_SAVE_FLAGS A2, %1, %2
1173 %else
1174 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1175 %endif
1176 EPILOGUE_3_ARGS_EX 8
1177ENDPROC iemAImpl_imul_two_u64 %+ %3
1178 %endif ; RT_ARCH_AMD64
1179%endmacro
; Native (no suffix), _intel and _amd instances.  The native one lists SF, ZF,
; AF and PF as undefined; the vendor specific ones pass no undefined flags.
1180IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1181IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1182IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1183
1184
1185;
1186; XCHG for memory operands. This implies locking. No flag changes.
1187;
1188; Each function takes two arguments, first the pointer to the memory,
1189; then the pointer to the register. They all return void.
1190;
1191BEGINCODE
; Exchanges the byte at [A0] with the byte at [A1].  No flag handling.
1192BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1193 PROLOGUE_2_ARGS
1194 mov T0_8, [A1] ; T0 = current value behind the second pointer
1195 xchg [A0], T0_8 ; swap with the first operand; xchg with a memory operand implies locking
1196 mov [A1], T0_8 ; hand the previous first-operand value back through A1
1197 EPILOGUE_2_ARGS
1198ENDPROC iemAImpl_xchg_u8_locked
1199
; Exchanges the word at [A0] with the word at [A1].  No flag handling.
1200BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1201 PROLOGUE_2_ARGS
1202 mov T0_16, [A1] ; T0 = current value behind the second pointer
1203 xchg [A0], T0_16 ; swap with the first operand; xchg with a memory operand implies locking
1204 mov [A1], T0_16 ; hand the previous first-operand value back through A1
1205 EPILOGUE_2_ARGS
1206ENDPROC iemAImpl_xchg_u16_locked
1207
; Exchanges the dword at [A0] with the dword at [A1].  No flag handling.
1208BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1209 PROLOGUE_2_ARGS
1210 mov T0_32, [A1] ; T0 = current value behind the second pointer
1211 xchg [A0], T0_32 ; swap with the first operand; xchg with a memory operand implies locking
1212 mov [A1], T0_32 ; hand the previous first-operand value back through A1
1213 EPILOGUE_2_ARGS
1214ENDPROC iemAImpl_xchg_u32_locked
1215
; Exchanges the qword at [A0] with the qword at [A1].  Only assembled for
; AMD64 hosts.  No flag handling.
1216%ifdef RT_ARCH_AMD64
1217BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1218 PROLOGUE_2_ARGS
1219 mov T0, [A1] ; T0 = current value behind the second pointer
1220 xchg [A0], T0 ; swap with the first operand; xchg with a memory operand implies locking
1221 mov [A1], T0 ; hand the previous first-operand value back through A1
1222 EPILOGUE_2_ARGS
1223ENDPROC iemAImpl_xchg_u64_locked
1224%endif
1225
1226; Unlocked variants for fDisregardLock mode.
1227
;;
; Swaps the byte at [A0] with the byte at [A1] using plain loads and stores,
; i.e. without the implicit locking of the xchg instruction.  Returns void and
; does not touch the guest flags.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A0]              ; T1 = old value of the first operand
        mov     T0_8, [A1]              ; T0 = old value of the second operand
        mov     [A0], T0_8              ; first operand  <- old second
        mov     [A1], T1_8              ; second operand <- old first
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1236
;;
; Swaps the word at [A0] with the word at [A1] using plain loads and stores,
; i.e. without the implicit locking of the xchg instruction.  Returns void and
; does not touch the guest flags.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A0]             ; T1 = old value of the first operand
        mov     T0_16, [A1]             ; T0 = old value of the second operand
        mov     [A0], T0_16             ; first operand  <- old second
        mov     [A1], T1_16             ; second operand <- old first
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1245
;;
; Swaps the dword at [A0] with the dword at [A1] using plain loads and stores,
; i.e. without the implicit locking of the xchg instruction.  Returns void and
; does not touch the guest flags.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A0]             ; T1 = old value of the first operand
        mov     T0_32, [A1]             ; T0 = old value of the second operand
        mov     [A0], T0_32             ; first operand  <- old second
        mov     [A1], T1_32             ; second operand <- old first
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1254
%ifdef RT_ARCH_AMD64
;;
; Swaps the qword at [A0] with the qword at [A1] using plain loads and stores,
; i.e. without the implicit locking of the xchg instruction.  Only assembled
; for AMD64 hosts.  Returns void and does not touch the guest flags.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A0]                ; T1 = old value of the first operand
        mov     T0, [A1]                ; T0 = old value of the second operand
        mov     [A0], T0                ; first operand  <- old second
        mov     [A1], T1                ; second operand <- old first
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1265
1266
1267;
1268; XADD for memory operands.
1269;
1270; Each function takes three arguments, first the pointer to the
1271; memory/register, then the pointer to the register, and finally a pointer to
1272; eflags. They all return void.
1273;
1274BEGINCODE
; 8-bit XADD without lock prefix: A0 = destination pointer, A1 = register
; pointer, A2 = eflags pointer.
1275BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1276 PROLOGUE_3_ARGS
1277 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1278 mov T0_8, [A1] ; T0 = register operand value
1279 xadd [A0], T0_8 ; *pDst += T0, T0 = previous *pDst
1280 mov [A1], T0_8 ; register operand receives the previous destination value
1281 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1282 EPILOGUE_3_ARGS
1283ENDPROC iemAImpl_xadd_u8
1284
; 16-bit XADD without lock prefix: A0 = destination pointer, A1 = register
; pointer, A2 = eflags pointer.
1285BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1286 PROLOGUE_3_ARGS
1287 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1288 mov T0_16, [A1] ; T0 = register operand value
1289 xadd [A0], T0_16 ; *pDst += T0, T0 = previous *pDst
1290 mov [A1], T0_16 ; register operand receives the previous destination value
1291 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1292 EPILOGUE_3_ARGS
1293ENDPROC iemAImpl_xadd_u16
1294
; 32-bit XADD without lock prefix: A0 = destination pointer, A1 = register
; pointer, A2 = eflags pointer.
1295BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1296 PROLOGUE_3_ARGS
1297 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1298 mov T0_32, [A1] ; T0 = register operand value
1299 xadd [A0], T0_32 ; *pDst += T0, T0 = previous *pDst
1300 mov [A1], T0_32 ; register operand receives the previous destination value
1301 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1302 EPILOGUE_3_ARGS
1303ENDPROC iemAImpl_xadd_u32
1304
; 64-bit XADD without lock prefix (AMD64 hosts only): A0 = destination pointer,
; A1 = register pointer, A2 = eflags pointer.
1305%ifdef RT_ARCH_AMD64
1306BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1307 PROLOGUE_3_ARGS
1308 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1309 mov T0, [A1] ; T0 = register operand value
1310 xadd [A0], T0 ; *pDst += T0, T0 = previous *pDst
1311 mov [A1], T0 ; register operand receives the previous destination value
1312 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1313 EPILOGUE_3_ARGS
1314ENDPROC iemAImpl_xadd_u64
1315%endif ; RT_ARCH_AMD64
1316
; 8-bit XADD with lock prefix; otherwise identical to iemAImpl_xadd_u8.
1317BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
1318 PROLOGUE_3_ARGS
1319 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1320 mov T0_8, [A1] ; T0 = register operand value
1321 lock xadd [A0], T0_8 ; atomically: *pDst += T0, T0 = previous *pDst
1322 mov [A1], T0_8 ; register operand receives the previous destination value
1323 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1324 EPILOGUE_3_ARGS
1325ENDPROC iemAImpl_xadd_u8_locked
1326
; 16-bit XADD with lock prefix; otherwise identical to iemAImpl_xadd_u16.
1327BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1328 PROLOGUE_3_ARGS
1329 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1330 mov T0_16, [A1] ; T0 = register operand value
1331 lock xadd [A0], T0_16 ; atomically: *pDst += T0, T0 = previous *pDst
1332 mov [A1], T0_16 ; register operand receives the previous destination value
1333 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1334 EPILOGUE_3_ARGS
1335ENDPROC iemAImpl_xadd_u16_locked
1336
; 32-bit XADD with lock prefix; otherwise identical to iemAImpl_xadd_u32.
1337BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1338 PROLOGUE_3_ARGS
1339 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1340 mov T0_32, [A1] ; T0 = register operand value
1341 lock xadd [A0], T0_32 ; atomically: *pDst += T0, T0 = previous *pDst
1342 mov [A1], T0_32 ; register operand receives the previous destination value
1343 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1344 EPILOGUE_3_ARGS
1345ENDPROC iemAImpl_xadd_u32_locked
1346
; 64-bit XADD with lock prefix (AMD64 hosts only); otherwise identical to
; iemAImpl_xadd_u64.
1347%ifdef RT_ARCH_AMD64
1348BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1349 PROLOGUE_3_ARGS
1350 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1351 mov T0, [A1] ; T0 = register operand value
1352 lock xadd [A0], T0 ; atomically: *pDst += T0, T0 = previous *pDst
1353 mov [A1], T0 ; register operand receives the previous destination value
1354 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1355 EPILOGUE_3_ARGS
1356ENDPROC iemAImpl_xadd_u64_locked
1357%endif ; RT_ARCH_AMD64
1358
1359
1360;
1361; CMPXCHG8B.
1362;
1363; These are tricky register wise, so the code is duplicated for each calling
1364; convention.
1365;
1366; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1367;
1368; C-proto:
1369; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1370; uint32_t *pEFlags));
1371;
1372; Note! Same structure as iemAImpl_cmpxchg16b, using 32-bit register halves.
1373;
; The instruction hard-wires EDX:EAX (compare value / old value) and ECX:EBX
; (replacement), so the argument registers are moved out of the way before
; those are loaded.  The lock prefix is always used.  Only ZF is loaded and
; saved.
1374BEGINCODE
1375BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1376%ifdef RT_ARCH_AMD64
1377 %ifdef ASM_CALL64_MSC
; Microsoft x64 convention: arguments arrive in rcx, rdx, r8 and r9.
1378 push rbx ; rbx is loaded with the replacement low half below
1379
1380 mov r11, rdx ; pu64EaxEdx (is also T1)
1381 mov r10, rcx ; pu64Dst
1382
1383 mov ebx, [r8] ; ECX:EBX = *pu64EbxEcx
1384 mov ecx, [r8 + 4]
1385 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1386 mov eax, [r11] ; EDX:EAX = *pu64EaxEdx (loaded after the flags because of the eax clobber)
1387 mov edx, [r11 + 4]
1388
1389 lock cmpxchg8b [r10]
1390
1391 mov [r11], eax ; write EDX:EAX back (holds the memory value if the compare failed)
1392 mov [r11 + 4], edx
1393 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1394
1395 pop rbx
1396 ret
1397 %else
; Other 64-bit convention: arguments arrive in rdi, rsi, rdx and rcx.
1398 push rbx ; rbx is loaded with the replacement low half below
1399
1400 mov r10, rcx ; pEFlags
1401 mov r11, rdx ; pu64EbxEcx (is also T1)
1402
1403 mov ebx, [r11] ; ECX:EBX = *pu64EbxEcx
1404 mov ecx, [r11 + 4]
1405 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1406 mov eax, [rsi] ; EDX:EAX = *pu64EaxEdx
1407 mov edx, [rsi + 4]
1408
1409 lock cmpxchg8b [rdi]
1410
1411 mov [rsi], eax ; write EDX:EAX back
1412 mov [rsi + 4], edx
1413 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1414
1415 pop rbx
1416 ret
1417
1418 %endif
1419%else
; 32-bit host: the first two arguments arrive in ecx and edx, the remaining two
; on the stack.  Four registers are pushed first, hence the "16 + 4" offsets
; (16 bytes of saved registers plus the return address).
1420 push esi
1421 push edi
1422 push ebx
1423 push ebp
1424
1425 mov edi, ecx ; pu64Dst
1426 mov esi, edx ; pu64EaxEdx
1427 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1428 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1429
1430 mov ebx, [ecx] ; ECX:EBX = *pu64EbxEcx (ecx is read as pointer last)
1431 mov ecx, [ecx + 4]
1432 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1433 mov eax, [esi] ; EDX:EAX = *pu64EaxEdx
1434 mov edx, [esi + 4]
1435
1436 lock cmpxchg8b [edi]
1437
1438 mov [esi], eax ; write EDX:EAX back
1439 mov [esi + 4], edx
1440 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1441
1442 pop ebp
1443 pop ebx
1444 pop edi
1445 pop esi
1446 ret 8 ; pops the two stack arguments
1447%endif
1448ENDPROC iemAImpl_cmpxchg8b
1449
; Locked entry point: tail-jumps to iemAImpl_cmpxchg8b, which already uses the
; lock prefix unconditionally, so both names share one implementation.
1450BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1451 ; Lazy bird always lock prefixes cmpxchg8b.
1452 jmp NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
1453ENDPROC iemAImpl_cmpxchg8b_locked
1454
1455%ifdef RT_ARCH_AMD64
1456
1457;
1458; CMPXCHG16B.
1459;
1460; These are tricky register wise, so the code is duplicated for each calling
1461; convention.
1462;
1463; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1464;
1465; C-proto:
1466; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1467; uint32_t *pEFlags));
1468;
1469; Note! Same structure as iemAImpl_cmpxchg8b, using full 64-bit registers.
1470;
; The instruction hard-wires RDX:RAX (compare value / old value) and RCX:RBX
; (replacement), so the argument registers are moved out of the way before
; those are loaded.  The lock prefix is always used.  Only ZF is loaded and
; saved.
1471BEGINCODE
1472BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
1473 %ifdef ASM_CALL64_MSC
; Microsoft x64 convention: arguments arrive in rcx, rdx, r8 and r9.
1474 push rbx ; rbx is loaded with the replacement low half below
1475
1476 mov r11, rdx ; pu128RaxRdx (is also T1)
1477 mov r10, rcx ; pu128Dst
1478
1479 mov rbx, [r8] ; RCX:RBX = *pu128RbxRcx
1480 mov rcx, [r8 + 8]
1481 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1482 mov rax, [r11] ; RDX:RAX = *pu128RaxRdx (loaded after the flags because of the eax clobber)
1483 mov rdx, [r11 + 8]
1484
1485 lock cmpxchg16b [r10]
1486
1487 mov [r11], rax ; write RDX:RAX back (holds the memory value if the compare failed)
1488 mov [r11 + 8], rdx
1489 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1490
1491 pop rbx
1492 ret
1493 %else
; Other 64-bit convention: arguments arrive in rdi, rsi, rdx and rcx.
1494 push rbx ; rbx is loaded with the replacement low half below
1495
1496 mov r10, rcx ; pEFlags
1497 mov r11, rdx ; pu128RbxRcx (is also T1)
1498
1499 mov rbx, [r11] ; RCX:RBX = *pu128RbxRcx
1500 mov rcx, [r11 + 8]
1501 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1502 mov rax, [rsi] ; RDX:RAX = *pu128RaxRdx
1503 mov rdx, [rsi + 8]
1504
1505 lock cmpxchg16b [rdi]
1506
1507 mov [rsi], rax ; write RDX:RAX back
1508 mov [rsi + 8], rdx
1509 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1510
1511 pop rbx
1512 ret
1513
1514 %endif
1515ENDPROC iemAImpl_cmpxchg16b
1516
; Locked entry point: tail-jumps to iemAImpl_cmpxchg16b, which already uses the
; lock prefix unconditionally, so both names share one implementation.
1517BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
1518 ; Lazy bird always lock prefixes cmpxchg16b.
1519 jmp NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
1520ENDPROC iemAImpl_cmpxchg16b_locked
1521
1522%endif ; RT_ARCH_AMD64
1523
1524
;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
; The accumulator value is read from and written back through the second
; argument (A1); the third argument (A2) is the replacement value, except for
; the 64-bit variant on 32-bit hosts where it is a pointer to the value.
;
; @param 1      Instruction prefix: empty for the plain variants, 'lock' for the
;               locked ones.
; @param 2      Function name suffix (empty or _locked).
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; accumulator = *puEax (after the flag load, which clobbers eax)
        %1 cmpxchg [A0], A2_8
        mov     [A1], al                ; write the accumulator back (memory value on mismatch)
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; accumulator = *puEax
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax                ; write the accumulator back
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; accumulator = *puEax
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax               ; write the accumulator back
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; accumulator = *puEax
        %1 cmpxchg [A0], A2
        mov     [A1], rax               ; write the accumulator back
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ;
        ; Four registers are pushed, so the stack arguments are found at
        ; esp + 16 (saved registers) + 4 (return address).
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ECX:EBX = replacement value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EDX:EAX = expected value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF=1 means the compare succeeded; ZF=0 means it failed and EDX:EAX
        ; now holds the memory value.  Only the failure case may branch to the
        ; real compare: falling into 'cmp eax, eax' after a failure would force
        ; ZF=1 and make the failed exchange look successful to the caller.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; equal: just set the other flags.
.store:
        mov     [esi], eax              ; write EDX:EAX back to the accumulator operand
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pops the two stack arguments

.cmpxchg8b_not_equal:
        ; [esi] still holds the expected value here, EDX:EAX the memory value.
        ; Compare the high halves first; the low halves decide only when the
        ; high halves match.
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1622
1623;;
1624; Macro for implementing a unary operator.
1625;
1626; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1627; variants, except on 32-bit systems where the 64-bit accesses require hand
1628; coding.
1629;
1630; All the functions take a pointer to the destination memory operand in A0
1631; and a pointer to eflags in A1 (there is no separate source operand).
1632;
1633; @param 1 The instruction mnemonic.
1634; @param 2 The modified flags.
1635; @param 3 The undefined flags.
1636;
; Each function loads the flags, applies the instruction directly to the memory
; behind A0 (with a lock prefix in the _locked variants) and saves the flags.
1637%macro IEMIMPL_UNARY_OP 3
1638BEGINCODE
1639BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1640 PROLOGUE_2_ARGS
1641 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1642 %1 byte [A0] ; operate in place on the 8-bit destination
1643 IEM_SAVE_FLAGS A1, %2, %3
1644 EPILOGUE_2_ARGS
1645ENDPROC iemAImpl_ %+ %1 %+ _u8
1646
1647BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1648 PROLOGUE_2_ARGS
1649 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1650 lock %1 byte [A0] ; same, as a locked read-modify-write
1651 IEM_SAVE_FLAGS A1, %2, %3
1652 EPILOGUE_2_ARGS
1653ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1654
1655BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1656 PROLOGUE_2_ARGS
1657 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1658 %1 word [A0] ; operate in place on the 16-bit destination
1659 IEM_SAVE_FLAGS A1, %2, %3
1660 EPILOGUE_2_ARGS
1661ENDPROC iemAImpl_ %+ %1 %+ _u16
1662
1663BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1664 PROLOGUE_2_ARGS
1665 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1666 lock %1 word [A0] ; same, as a locked read-modify-write
1667 IEM_SAVE_FLAGS A1, %2, %3
1668 EPILOGUE_2_ARGS
1669ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1670
1671BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1672 PROLOGUE_2_ARGS
1673 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1674 %1 dword [A0] ; operate in place on the 32-bit destination
1675 IEM_SAVE_FLAGS A1, %2, %3
1676 EPILOGUE_2_ARGS
1677ENDPROC iemAImpl_ %+ %1 %+ _u32
1678
1679BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1680 PROLOGUE_2_ARGS
1681 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1682 lock %1 dword [A0] ; same, as a locked read-modify-write
1683 IEM_SAVE_FLAGS A1, %2, %3
1684 EPILOGUE_2_ARGS
1685ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1686
1687 %ifdef RT_ARCH_AMD64
; The 64-bit variants are only assembled for AMD64 hosts.
1688BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1689 PROLOGUE_2_ARGS
1690 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1691 %1 qword [A0] ; operate in place on the 64-bit destination
1692 IEM_SAVE_FLAGS A1, %2, %3
1693 EPILOGUE_2_ARGS
1694ENDPROC iemAImpl_ %+ %1 %+ _u64
1695
1696BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1697 PROLOGUE_2_ARGS
1698 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1699 lock %1 qword [A0] ; same, as a locked read-modify-write
1700 IEM_SAVE_FLAGS A1, %2, %3
1701 EPILOGUE_2_ARGS
1702ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1703 %endif ; RT_ARCH_AMD64
1704
1705%endmacro
1706
; inc and dec leave CF out of their modified mask; neg includes it; not passes
; empty masks for both the modified and the undefined flags.
1707IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1708IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1709IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1710IEMIMPL_UNARY_OP not, 0, 0
1711
1712
1713;
1714; BSWAP. No flag changes.
1715;
1716; Each function takes one argument, pointer to the value to bswap
1717; (input/output). They all return void.
1718;
; The 16-bit variant reads and writes a full 32 bits through A0 and emits a raw
; 66h operand-size prefix byte in front of the 32-bit bswap encoding, i.e. it
; executes the 16-bit operand-size form of bswap on the host CPU rather than
; computing a byte swap itself.
1719BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1720 PROLOGUE_1_ARGS
1721 mov T0_32, [A0] ; just in case any of the upper bits are used.
1722 db 66h ; operand-size prefix for the following instruction
1723 bswap T0_32
1724 mov [A0], T0_32 ; store all 32 bits back
1725 EPILOGUE_1_ARGS
1726ENDPROC iemAImpl_bswap_u16
1727
; Byte-swaps the 32-bit value that A0 points to, in place.  No flag handling.
1728BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1729 PROLOGUE_1_ARGS
1730 mov T0_32, [A0] ; T0 = value
1731 bswap T0_32 ; reverse the four bytes
1732 mov [A0], T0_32 ; store the result back
1733 EPILOGUE_1_ARGS
1734ENDPROC iemAImpl_bswap_u32
1735
;;
; Byte-swaps the 64-bit value that A0 points to, in place.  No flag handling.
;
; AMD64 hosts use a single 64-bit bswap.  On 32-bit hosts T0 and T1 are 32 bits
; wide, so each half is byte-swapped separately and the two halves are stored
; in exchanged positions.
;
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
        PROLOGUE_1_ARGS
%ifdef RT_ARCH_AMD64
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
%else
        mov     T1, [A0 + 4]            ; T1 = high dword
        mov     T0, [A0]                ; T0 = low dword
        bswap   T1
        bswap   T0
        mov     [A0], T1                ; swapped high dword becomes the low dword
        mov     [A0 + 4], T0            ; swapped low dword becomes the high dword
%endif
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u64
1754
1755
1756;;
1757; Macro for implementing a shift operation.
1758;
1759; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1760; 32-bit systems where the 64-bit accesses require hand coding.
1761;
1762; All the functions take a pointer to the destination memory operand in A0,
1763; the shift count in A1 and a pointer to eflags in A2.
1764;
1765; @param 1 The instruction mnemonic.
1766; @param 2 The modified flags.
1767; @param 3 The undefined flags.
1768;
1769; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1770;
1771; @note the _intel and _amd variants are implemented in C.
1772;
; The variable-count shift forms take their count in cl.  With ASM_CALL64_GCC
; the count is copied from A1 into cl; in the other configurations A0 and A1
; are exchanged instead, which only yields the count in cl because A0 is
; assumed to be xCX there (see the ASSUMPTIONS note above).
1773%macro IEMIMPL_SHIFT_OP 3
1774BEGINCODE
1775BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1776 PROLOGUE_3_ARGS
1777 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1778 %ifdef ASM_CALL64_GCC
1779 mov cl, A1_8 ; cl = shift count
1780 %1 byte [A0], cl
1781 %else
1782 xchg A1, A0 ; count -> A0 (cl), destination pointer -> A1
1783 %1 byte [A1], cl
1784 %endif
1785 IEM_SAVE_FLAGS A2, %2, %3
1786 EPILOGUE_3_ARGS
1787ENDPROC iemAImpl_ %+ %1 %+ _u8
1788
1789BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1790 PROLOGUE_3_ARGS
1791 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1792 %ifdef ASM_CALL64_GCC
1793 mov cl, A1_8 ; cl = shift count
1794 %1 word [A0], cl
1795 %else
1796 xchg A1, A0 ; count -> A0 (cl), destination pointer -> A1
1797 %1 word [A1], cl
1798 %endif
1799 IEM_SAVE_FLAGS A2, %2, %3
1800 EPILOGUE_3_ARGS
1801ENDPROC iemAImpl_ %+ %1 %+ _u16
1802
1803BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1804 PROLOGUE_3_ARGS
1805 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1806 %ifdef ASM_CALL64_GCC
1807 mov cl, A1_8 ; cl = shift count
1808 %1 dword [A0], cl
1809 %else
1810 xchg A1, A0 ; count -> A0 (cl), destination pointer -> A1
1811 %1 dword [A1], cl
1812 %endif
1813 IEM_SAVE_FLAGS A2, %2, %3
1814 EPILOGUE_3_ARGS
1815ENDPROC iemAImpl_ %+ %1 %+ _u32
1816
1817 %ifdef RT_ARCH_AMD64
; The 64-bit variant is only assembled for AMD64 hosts.
1818BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1819 PROLOGUE_3_ARGS
1820 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1821 %ifdef ASM_CALL64_GCC
1822 mov cl, A1_8 ; cl = shift count
1823 %1 qword [A0], cl
1824 %else
1825 xchg A1, A0 ; count -> A0 (cl), destination pointer -> A1
1826 %1 qword [A1], cl
1827 %endif
1828 IEM_SAVE_FLAGS A2, %2, %3
1829 EPILOGUE_3_ARGS
1830ENDPROC iemAImpl_ %+ %1 %+ _u64
1831 %endif ; RT_ARCH_AMD64
1832
1833%endmacro
1834
; Rotates list OF and CF as modified; the shifts additionally list SF, ZF and
; PF as modified and AF as undefined.
1835IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1836IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1837IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
1838IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
1839IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1840IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1841IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1842
1843
1844;;
1845; Macro for implementing a double precision shift operation.
1846;
1847; This will generate code for the 16, 32 and 64 bit accesses, except on
1848; 32-bit systems where the 64-bit accesses require hand coding.
1849;
1850; The functions take the destination operand (r/m) in A0, the source (reg) in
1851; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1852;
1853; @param 1 The instruction mnemonic.
1854; @param 2 The modified flags.
1855; @param 3 The undefined flags.
1856;
1857; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1858;
1859; @note the _intel and _amd variants are implemented in C.
1860;
; The count must be in cl.  With ASM_CALL64_GCC, A3 and A2 are exchanged around
; the instruction (A3 is assumed to be xCX there) and exchanged back so that A3
; is the eflags pointer again for IEM_SAVE_FLAGS.  Otherwise A0 and A2 are
; exchanged (A0 assumed to be xCX) and not restored, as neither is needed
; afterwards in its original role.
1861%macro IEMIMPL_SHIFT_DBL_OP 3
1862BEGINCODE
1863BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1864 PROLOGUE_4_ARGS
1865 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1866 %ifdef ASM_CALL64_GCC
1867 xchg A3, A2 ; count -> A3 (cl), eflags pointer parked in A2
1868 %1 [A0], A1_16, cl
1869 xchg A3, A2 ; restore A3 = eflags pointer
1870 %else
1871 xchg A0, A2 ; count -> A0 (cl), destination pointer -> A2
1872 %1 [A2], A1_16, cl
1873 %endif
1874 IEM_SAVE_FLAGS A3, %2, %3
1875 EPILOGUE_4_ARGS
1876ENDPROC iemAImpl_ %+ %1 %+ _u16
1877
1878BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1879 PROLOGUE_4_ARGS
1880 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1881 %ifdef ASM_CALL64_GCC
1882 xchg A3, A2 ; count -> A3 (cl), eflags pointer parked in A2
1883 %1 [A0], A1_32, cl
1884 xchg A3, A2 ; restore A3 = eflags pointer
1885 %else
1886 xchg A0, A2 ; count -> A0 (cl), destination pointer -> A2
1887 %1 [A2], A1_32, cl
1888 %endif
1889 IEM_SAVE_FLAGS A3, %2, %3
1890 EPILOGUE_4_ARGS
1891ENDPROC iemAImpl_ %+ %1 %+ _u32
1892
1893 %ifdef RT_ARCH_AMD64
; The 64-bit variant is only assembled for AMD64 hosts; it is declared with 20
; argument bytes and uses the _EX epilogue with 12.
1894BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1895 PROLOGUE_4_ARGS
1896 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1897 %ifdef ASM_CALL64_GCC
1898 xchg A3, A2 ; count -> A3 (cl), eflags pointer parked in A2
1899 %1 [A0], A1, cl
1900 xchg A3, A2 ; restore A3 = eflags pointer
1901 %else
1902 xchg A0, A2 ; count -> A0 (cl), destination pointer -> A2
1903 %1 [A2], A1, cl
1904 %endif
1905 IEM_SAVE_FLAGS A3, %2, %3
1906 EPILOGUE_4_ARGS_EX 12
1907ENDPROC iemAImpl_ %+ %1 %+ _u64
1908 %endif ; RT_ARCH_AMD64
1909
1910%endmacro
1911
1912IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1913IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1914
1915
1916;;
1917; Macro for implementing multiplication operations.
1918;
1919; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1920; 32-bit systems where the 64-bit accesses require hand coding.
1921;
1922; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1923; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1924; pointer to eflags in A3.
1925;
1926; The functions all return 0 so the caller can be used for div/idiv as well as
1927; for the mul/imul implementation.
1928;
1929; @param 1 The instruction mnemonic.
1930; @param 2 The modified flags.
1931; @param 3 The undefined flags.
1932; @param 4 Name suffix.
1933; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
; Only the value 1 is tested below: it selects
; IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF, while 0 and 2 both use IEM_SAVE_FLAGS.
1934;
1935; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1936;
1937%macro IEMIMPL_MUL_OP 5
1938BEGINCODE
; 8-bit: A0 = pointer to AX, A1 = operand, A2 = pointer to eflags.
1939BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
1940 PROLOGUE_3_ARGS
1941 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1942 mov al, [A0] ; al = multiplicand (low byte of the AX storage)
1943 %1 A1_8 ; ax = al * operand
1944 mov [A0], ax ; store the full 16-bit product
1945 %if %5 != 1
1946 IEM_SAVE_FLAGS A2, %2, %3
1947 %else
; Intel variant: presumably clears AF and ZF and derives SF/PF from the result
; - confirm against the macro definition.
1948 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
1949 %endif
1950 xor eax, eax ; return 0
1951 EPILOGUE_3_ARGS
1952ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
1953
; 16-bit: the product lands in dx:ax and is stored through A0 (low) and A1 (high).
1954BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
1955 PROLOGUE_4_ARGS
1956 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1957 mov ax, [A0] ; ax = multiplicand
1958 %ifdef ASM_CALL64_GCC
1959 %1 A2_16
1960 mov [A0], ax
1961 mov [A1], dx
1962 %else
1963 mov T1, A1 ; keep the rDX pointer in T1; presumably A1 is xDX here, which the multiply overwrites
1964 %1 A2_16
1965 mov [A0], ax
1966 mov [T1], dx
1967 %endif
1968 %if %5 != 1
1969 IEM_SAVE_FLAGS A3, %2, %3
1970 %else
1971 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
1972 %endif
1973 xor eax, eax ; return 0
1974 EPILOGUE_4_ARGS
1975ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
1976
; 32-bit: the product lands in edx:eax and is stored through A0 (low) and A1 (high).
1977BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
1978 PROLOGUE_4_ARGS
1979 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1980 mov eax, [A0] ; eax = multiplicand
1981 %ifdef ASM_CALL64_GCC
1982 %1 A2_32
1983 mov [A0], eax
1984 mov [A1], edx
1985 %else
1986 mov T1, A1 ; keep the rDX pointer in T1 across the multiply
1987 %1 A2_32
1988 mov [A0], eax
1989 mov [T1], edx
1990 %endif
1991 %if %5 != 1
1992 IEM_SAVE_FLAGS A3, %2, %3
1993 %else
1994 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
1995 %endif
1996 xor eax, eax ; return 0
1997 EPILOGUE_4_ARGS
1998ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
1999
2000 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
; 64-bit: the product lands in rdx:rax and is stored through A0 (low) and A1 (high).
2001BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2002 PROLOGUE_4_ARGS
2003 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2004 mov rax, [A0] ; rax = multiplicand
2005 %ifdef ASM_CALL64_GCC
2006 %1 A2
2007 mov [A0], rax
2008 mov [A1], rdx
2009 %else
2010 mov T1, A1 ; keep the rDX pointer in T1 across the multiply
2011 %1 A2
2012 mov [A0], rax
2013 mov [T1], rdx
2014 %endif
2015 %if %5 != 1
2016 IEM_SAVE_FLAGS A3, %2, %3
2017 %else
2018 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
2019 %endif
2020 xor eax, eax ; return 0
2021 EPILOGUE_4_ARGS_EX 12
2022ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2023 %endif ; RT_ARCH_AMD64
2024
2025%endmacro
2026
; Native (no suffix), _intel and _amd instances for mul and imul.  The native
; ones list SF, ZF, AF and PF as undefined; the vendor specific ones pass no
; undefined flags.
2027IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2028IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2029IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2030IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2031IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2032IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2033
2034
2035BEGINCODE
2036;;
2037; Worker function for negating the number held in the register pair T1:T0,
; using the 32-bit parts of both registers (T0 = low half, T1 = high half).
; It computes 0 - T1:T0: two zeros are pushed, exchanged with the registers so
; that the registers hold zero and the stack holds the old values, and the old
; values are then subtracted with sub/sbb so the borrow carries from the low
; into the high half.
2038; @uses None (T0,T1)
; The arithmetic flags are left as set by the final sbb.
2039BEGINPROC iemAImpl_negate_T0_T1_u32
2040 push 0
2041 push 0
2042 xchg T0_32, [xSP] ; T0 = 0, first stack slot = old T0
2043 xchg T1_32, [xSP + xCB] ; T1 = 0, second stack slot = old T1
2044 sub T0_32, [xSP] ; low half: 0 - old T0
2045 sbb T1_32, [xSP + xCB] ; high half: 0 - old T1 - borrow
2046 add xSP, xCB*2 ; drop the two temporaries
2047 ret
2048ENDPROC iemAImpl_negate_T0_T1_u32
2049
2050%ifdef RT_ARCH_AMD64
2051;;
2052; Worker function for negating the number held in the register pair T1:T0,
; using the full 64-bit registers (T0 = low half, T1 = high half).  Only
; assembled for AMD64 hosts.  It computes 0 - T1:T0: two zeros are pushed,
; exchanged with the registers, and the old values are then subtracted with
; sub/sbb so the borrow carries from the low into the high half.
2053; @uses None (T0,T1)
; The arithmetic flags are left as set by the final sbb.
2054BEGINPROC iemAImpl_negate_T0_T1_u64
2055 push 0
2056 push 0
2057 xchg T0, [xSP] ; T0 = 0, first stack slot = old T0
2058 xchg T1, [xSP + xCB] ; T1 = 0, second stack slot = old T1
2059 sub T0, [xSP] ; low half: 0 - old T0
2060 sbb T1, [xSP + xCB] ; high half: 0 - old T1 - borrow
2061 add xSP, xCB*2 ; drop the two temporaries
2062 ret
2063ENDPROC iemAImpl_negate_T0_T1_u64
2064%endif
2065
2066
2067;;
2068; Macro for implementing a division operations.
2069;
2070; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2071; 32-bit system where the 64-bit accesses requires hand coding.
2072;
2073; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2074; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2075; pointer to eflags in A3.
2076;
2077; The functions all return 0 on success and -1 if a divide error should be
2078; raised by the caller.
2079;
2080; @param 1 The instruction mnemonic.
2081; @param 2 The modified flags.
2082; @param 3 The undefined flags.
2083; @param 4 1 if signed, 0 if unsigned.
2084; @param 5 Function suffix.
2085; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2086; 2 for AMD (set AF, clear PF, ZF and SF).
2087;
2088; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2089;
2090%macro IEMIMPL_DIV_OP 6
2091BEGINCODE
2092BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2093 PROLOGUE_3_ARGS
2094
2095 ; div by chainsaw check.
2096 test A1_8, A1_8
2097 jz .div_zero
2098
2099 ; Overflow check - unsigned division is simple to verify, haven't
2100 ; found a simple way to check signed division yet unfortunately.
2101 %if %4 == 0
2102 cmp [A0 + 1], A1_8
2103 jae .div_overflow
2104 %else
2105 mov T0_16, [A0] ; T0 = dividend
2106 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2107 test A1_8, A1_8
2108 js .divisor_negative
2109 test T0_16, T0_16
2110 jns .both_positive
2111 neg T0_16
2112.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2113 push T0 ; Start off like unsigned below.
2114 shr T0_16, 7
2115 cmp T0_8, A1_8
2116 pop T0
2117 jb .div_no_overflow
2118 ja .div_overflow
2119 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2120 cmp T0_8, A1_8
2121 jae .div_overflow
2122 jmp .div_no_overflow
2123
2124.divisor_negative:
2125 neg A1_8
2126 test T0_16, T0_16
2127 jns .one_of_each
2128 neg T0_16
2129.both_positive: ; Same as unsigned shifted by sign indicator bit.
2130 shr T0_16, 7
2131 cmp T0_8, A1_8
2132 jae .div_overflow
2133.div_no_overflow:
2134 mov A1, T1 ; restore divisor
2135 %endif
2136
2137 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2138 mov ax, [A0]
2139 %1 A1_8
2140 mov [A0], ax
2141 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2142 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2143 %else
2144 IEM_SAVE_FLAGS A2, %2, %3
2145 %endif
2146 xor eax, eax
2147
2148.return:
2149 EPILOGUE_3_ARGS
2150
2151.div_zero:
2152.div_overflow:
2153 mov eax, -1
2154 jmp .return
2155ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2156
;
; 16-bit divisor variant: divides the 32-bit value formed by [A1]:[A0]
; (high:low) by A2_16; AX is written back to [A0] and DX to [A1].
;
; @returns  0 in eax on success, -1 on division by zero or quotient overflow
;           (nothing is written back in the failure case).
; @param    A0      Pointer to the low 16 bits of the dividend / AX result (in/out).
; @param    A1      Pointer to the high 16 bits of the dividend / DX result (in/out).
; @param    A2      The 16-bit divisor.
; @param    A3      Handed to the IEM_*_FLAGS macros (presumably the EFLAGS pointer).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; high word of dividend >= divisor -> quotient won't fit.
        jae     .div_overflow
 %else
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
        jo      .div_overflow           ; Dividend was INT32_MIN: |dividend| = 0x80000000 always overflows
                                        ; and would otherwise wrap to zero in the 16-bit compare below.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
        jo      .div_overflow           ; INT32_MIN dividend, see above.
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; the divisor is moved to T1 before dx gets loaded
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; the high-word pointer is moved to T1 before dx gets loaded
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2234
;
; 32-bit divisor variant: divides the 64-bit value formed by [A1]:[A0]
; (high:low) by A2_32; EAX is written back to [A0] and EDX to [A1].
;
; @returns  0 in eax on success, -1 on division by zero or quotient overflow
;           (nothing is written back in the failure case).
; @param    A0      Pointer to the low 32 bits of the dividend / EAX result (in/out).
; @param    A1      Pointer to the high 32 bits of the dividend / EDX result (in/out).
; @param    A2      The 32-bit divisor.
; @param    A3      Handed to the IEM_*_FLAGS macros (presumably the EFLAGS pointer).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; high dword of dividend >= divisor -> quotient won't fit.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
        test    T1_32, T1_32            ; Still negative after negation means the dividend was INT64_MIN,
        js      .div_overflow           ; which always overflows; the shl below would drop that top bit.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
        test    T1_32, T1_32            ; INT64_MIN dividend, see above.
        js      .div_overflow
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; the divisor is moved to T1 before edx gets loaded
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; the high-dword pointer is moved to T1 before edx gets loaded
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; balance the 'push A2' done by the signed overflow check.
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2320
2321 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2322BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2323 PROLOGUE_4_ARGS
2324
2325 test A2, A2
2326 jz .div_zero
2327 %if %4 == 0
2328 cmp [A1], A2
2329 jae .div_overflow
2330 %else
2331 push A2 ; save A2 so we modify it (we out of regs on x86).
2332 mov T0, [A0] ; T0 = dividend low
2333 mov T1, [A1] ; T1 = dividend high
2334 test A2, A2
2335 js .divisor_negative
2336 test T1, T1
2337 jns .both_positive
2338 call NAME(iemAImpl_negate_T0_T1_u64)
2339.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2340 push T0 ; Start off like unsigned below.
2341 shl T1, 1
2342 shr T0, 63
2343 or T1, T0
2344 cmp T1, A2
2345 pop T0
2346 jb .div_no_overflow
2347 ja .div_overflow
2348 mov T1, 0x7fffffffffffffff
2349 and T0, T1 ; Special case for covering (divisor - 1).
2350 cmp T0, A2
2351 jae .div_overflow
2352 jmp .div_no_overflow
2353
2354.divisor_negative:
2355 neg A2
2356 test T1, T1
2357 jns .one_of_each
2358 call NAME(iemAImpl_negate_T0_T1_u64)
2359.both_positive: ; Same as unsigned shifted by sign indicator bit.
2360 shl T1, 1
2361 shr T0, 63
2362 or T1, T0
2363 cmp T1, A2
2364 jae .div_overflow
2365.div_no_overflow:
2366 pop A2
2367 %endif
2368
2369 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2370 mov rax, [A0]
2371 %ifdef ASM_CALL64_GCC
2372 mov T1, A2
2373 mov rax, [A0]
2374 mov rdx, [A1]
2375 %1 T1
2376 mov [A0], rax
2377 mov [A1], rdx
2378 %else
2379 mov T1, A1
2380 mov rax, [A0]
2381 mov rdx, [T1]
2382 %1 A2
2383 mov [A0], rax
2384 mov [T1], rdx
2385 %endif
2386 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2387 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2388 %else
2389 IEM_SAVE_FLAGS A3, %2, %3
2390 %endif
2391 xor eax, eax
2392
2393.return:
2394 EPILOGUE_4_ARGS_EX 12
2395
2396.div_overflow:
2397 %if %4 != 0
2398 pop A2
2399 %endif
2400.div_zero:
2401 mov eax, -1
2402 jmp .return
2403ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2404 %endif ; !RT_ARCH_AMD64
2405
2406%endmacro
2407
;
; Instantiate the division workers.  Macro arguments as used by the bodies:
;   1: the instruction (div / idiv),
;   2+3: the values handed to IEM_MAYBE_LOAD_FLAGS / IEM_SAVE_FLAGS,
;   4: zero selects the unsigned overflow pre-check, non-zero the signed one,
;   5: suffix appended to the function name (empty, _intel or _amd),
;   6: 2 selects the AMD style IEM_ADJUST_FLAGS path, anything else IEM_SAVE_FLAGS.
;
; Unsigned:
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
; Signed:
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2414
2415
;;
; Generates a worker that executes a single memory fence instruction.
;
; The generated function takes no arguments and returns nothing.
;
; @param        1       The fence instruction to emit.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; the fence itself
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2434
;;
; Memory fence replacement for hosts without SSE2.
;
; Exchanges xAX with a freshly pushed copy of itself; an XCHG with a memory
; operand is implicitly locked on x86, so this orders memory accesses without
; needing any of the fence instructions.  No registers are changed.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; stack slot = xAX
        xchg    xAX, [xSP]              ; swap with the identical value in memory
        add     xSP, xCB                ; drop the stack slot again
        ret
ENDPROC iemAImpl_alt_mem_fence
2444
2445
;;
; Prepares the host FPU for executing the instruction being emulated by
; loading the relevant parts of the guest control word and the guest
; condition code bits of the status word.
;
; The current FPU environment is stored at [xSP], patched and loaded back, so
; the caller must have reserved stack space at xSP before using this macro.
;
; @uses     T0, T1 and the FNSTENV image at [xSP].  NOTE(review): the original
;           comment says 24 bytes of stack; the 32-bit protected mode FNSTENV
;           image is 28 bytes.  Callers in this file reserve 20h bytes.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - take exception masks, precision control and rounding control from the guest.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - guest C0, C1, C2 and C3 (may be left undefined by the instruction)
        ;       combined with the TOP field of the current host environment.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
2471
2472
;;
; Prepares the host FPU for executing the instruction being emulated by
; loading the relevant parts of the guest control word and status word, and
; additionally tags the host top-of-stack register as empty when the guest
; register at the guest's TOP is empty.
;
; ASSUMES the actual (host) TOP is 7.
;
; @uses     T0, T1 and the FNSTENV image at [xSP].  NOTE(review): the original
;           comment says 24 bytes of stack; the 32-bit protected mode FNSTENV
;           image is 28 bytes.  Callers in this file reserve 20h bytes.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - take exception masks, precision control and rounding control from the guest.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - guest C0, C1, C2 and C3 combined with the TOP field of the host environment.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out): look up the guest TOP register in the guest tag word.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK
        bt      [%1 + X86FXSTATE.FTW], T1_16    ; Empty if FTW bit is clear.  Fixed register order.
        jc      %%guest_top_in_use
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%guest_top_in_use:

        fldenv  [xSP]
%endmacro
2510
2511
;;
; Result of an FPU worker: one 80-bit value followed by the output FSW.
; (Need to move this as well somewhere better?)
;
struc IEMFPURESULT
    .r80Result  resb 10                 ; 80-bit floating point result (offset 0)
    .FSW        resb 2                  ; 16-bit FPU status word (offset 10)
endstruc
2519
2520
;;
; Result of an FPU worker producing two 80-bit values with the output FSW
; placed between them.
; (Need to move this as well somewhere better?)
;
struc IEMFPURESULTTWO
    .r80Result1 resb 10                 ; first 80-bit result (offset 0)
    .FSW        resb 2                  ; 16-bit FPU status word (offset 10)
    .r80Result2 resb 10                 ; second 80-bit result (offset 12)
endstruc
2529
2530
2531;
2532;---------------------- 16-bit signed integer operations ----------------------
2533;
2534
2535
;;
; Converts a 16-bit signed integer value to an 80-bit floating point one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2559
2560
;;
; Stores an 80-bit floating point value (register) as a 16-bit signed integer
; (memory) using FISTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; the value is loaded before the guest FCW takes effect
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2584
2585
;;
; Stores an 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation, using FISTTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; the value is loaded before the guest FCW takes effect
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2610
2611
;;
; Generates a worker for an FPU instruction taking ST0 (80-bit) and a 16-bit
; signed integer memory operand, returning the new ST0 value and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2648
2649
;;
; Generates a worker for an FPU instruction taking ST0 (80-bit) and a 16-bit
; signed integer memory operand, returning only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2680
2681
2682
2683;
2684;---------------------- 32-bit signed integer operations ----------------------
2685;
2686
2687
;;
; Converts a 32-bit signed integer value to an 80-bit floating point one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2711
2712
;;
; Stores an 80-bit floating point value (register) as a 32-bit signed integer
; (memory) using FISTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; the value is loaded before the guest FCW takes effect
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2736
2737
;;
; Stores an 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation, using FISTTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; the value is loaded before the guest FCW takes effect
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2762
2763
;;
; Generates a worker for an FPU instruction taking ST0 (80-bit) and a 32-bit
; signed integer memory operand, returning the new ST0 value and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2800
2801
;;
; Generates a worker for an FPU instruction taking ST0 (80-bit) and a 32-bit
; signed integer memory operand, returning only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2832
2833
2834
2835;
2836;---------------------- 64-bit signed integer operations ----------------------
2837;
2838
2839
;;
; Converts a 64-bit signed integer value to an 80-bit floating point one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2863
2864
;;
; Stores an 80-bit floating point value (register) as a 64-bit signed integer
; (memory) using FISTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; the value is loaded before the guest FCW takes effect
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2888
2889
;;
; Stores an 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation, using FISTTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; the value is loaded before the guest FCW takes effect
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2914
2915
2916
2917;
2918;---------------------- 32-bit floating point operations ----------------------
2919;
2920
;;
; Converts a 32-bit floating point value to an 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2944
2945
;;
; Stores an 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; the value is loaded before the guest FCW takes effect
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]              ; non-popping store; the fninit below resets the stack anyway

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2969
2970
;;
; Generates a worker for an FPU instruction taking ST0 (80-bit) and a 32-bit
; floating point memory operand, returning the new ST0 value and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3007
3008
;;
; Generates a worker for an FPU instruction taking ST0 (80-bit) and a 32-bit
; floating point memory operand, returning only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3039
3040
3041
3042;
3043;---------------------- 64-bit floating point operations ----------------------
3044;
3045
;;
; Converts a 64-bit floating point value to an 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3069
3070
;;
; Stores an 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; the value is loaded before the guest FCW takes effect
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]              ; non-popping store; the fninit below resets the stack anyway

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3094
3095
;;
; Generates a worker for an FPU instruction taking ST0 (80-bit) and a 64-bit
; floating point memory operand, returning the new ST0 value and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3132
;;
; Generates a worker for an FPU instruction taking ST0 (80-bit) and a 64-bit
; floating point memory operand, returning only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3163
3164
3165
3166;
3167;---------------------- 80-bit floating point operations ----------------------
3168;
3169
;;
; Loads an 80-bit floating point register value from memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3193
3194
;;
; Stores an 80-bit floating point register to memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; the value is loaded before the guest FCW takes effect
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3218
3219
;;
; Loads an 80-bit floating point register value from a packed BCD value in
; memory (FBLD).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3243
3244
;;
; Stores an 80-bit floating point register to memory as packed BCD (FBSTP).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit BCD value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; the value is loaded before the guest FCW takes effect
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3268
3269
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values, returning the resulting ST0 value and the FSW.
;
; @param    1       The instruction
; @param    2       The operands to give the instruction; may be empty ({}) for
;                   instructions with implicit operands.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; pushed first, so it ends up in ST1
        fld     tword [A2]              ; pushed last, so it ends up in ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3310
3311
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values, ST1 and ST0, storing the result in ST1 and popping the stack.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A2]              ; pushed first, so it ends up in ST1
        fld     tword [A3]              ; pushed last, so it ends up in ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3347
3348
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values, returning only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; pushed first, so it ends up in ST1
        fld     tword [A2]              ; pushed last, so it ends up in ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3381
3382
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values, returning the FSW and the resulting EFLAGS (eax).
;
; @param    1       The instruction
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A3]              ; pushed first, so it ends up in ST1
        fld     tword [A2]              ; pushed last, so it ends up in ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        fnstsw  word [A1]
        pushf                           ; grab the flags right after the instruction ...
        pop     xAX                     ; ... and return them in xAX

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3418
3419
;;
; Generates a worker for an FPU instruction working on one 80-bit floating
; point value, returning the resulting ST0 value and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20               ; room for the FPU environment image used by the macro below

        fninit                          ; start with a clean host FPU
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave a clean host FPU behind
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3456
3457
3458;;
3459; FPU instruction working on one 80-bit floating point value, only
3460; returning FSW.
3461;
3462; @param 1 The instruction
3463; @param 2 Non-zero to also restore FTW.
3464;
3465; @param A0 FPU context (fxsave).
3466; @param A1 Pointer to a uint16_t for the resulting FSW.
3467; @param A2 Pointer to the 80-bit value.
3468;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20               ; reserve 32 bytes of stack, released below

        fninit
        fld     tword [A2]              ; ST(0) = the 80-bit input value
        ; Macro parameter 2 picks which of the two state loading helpers is used.
%if %2 == 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%endif
        %1

        fnstsw  word [A1]               ; only the FSW is returned

        fninit                          ; leave the FPU in its reset state
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3493
3494
3495
3496;;
3497; FPU instruction loading a 80-bit floating point constant.
3498;
3499; @param 1 The instruction
3500;
3501; @param A0 FPU context (fxsave).
3502; @param A1 Pointer to a IEMFPURESULT for the output.
3503;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; reserve 32 bytes of stack, released below

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; pushes the constant onto the FPU stack

        ; Store FSW first, then clear the exception flags and pop the constant.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_2_ARGS
        ; The name must match the BEGINPROC_FASTCALL line above exactly; the
        ; dangling '%+' that used to trail this line had no right-hand operand.
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3530
3531
3532;;
3533; FPU instruction working on one 80-bit floating point value, outputing two.
3534;
3535; @param 1 The instruction
3536;
3537; @param A0 FPU context (fxsave).
3538; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3539; @param A2 Pointer to the 80-bit value.
3540;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20               ; reserve 32 bytes of stack, released below

        fninit
        fld     tword [A2]              ; ST(0) = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; FSW is stored first. The two results are then popped in stack order:
        ; the top of stack goes to r80Result2, the next entry to r80Result1.
        ; Exception flags are cleared ahead of each store.
        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3566
3567
3568
3569
3570;---------------------- SSE and MMX Operations ----------------------
3571
;; @todo what do we need to do for MMX?
; Hooks placed at the start and end of the MMX workers in this file.
; Both are currently empty and expand to nothing.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
; Hooks placed at the start and end of the SSE workers in this file.
; Both are currently empty and expand to nothing.
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
; Hooks placed at the start and end of the AVX workers in this file.
; Both are currently empty and expand to nothing.
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3589
3590
3591;;
3592; Media instruction working on two full sized registers.
3593;
3594; @param 1 The instruction
3595; @param 2 Whether there is an MMX variant (1) or not (0).
3596;
3597; @param A0 FPU context (fxsave).
3598; @param A1 Pointer to the first media register size operand (input/output).
3599; @param A2 Pointer to the second media register size operand (input).
3600;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
; 64-bit MMX flavour, only generated when macro parameter 2 is non-zero.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]               ; second operand (input only)
        movq    mm0, [A1]               ; first operand (input/output)
        %1      mm0, mm1
        movq    [A1], mm0               ; write the result over the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit SSE flavour, always generated.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]              ; second operand (input only)
        movdqu  xmm0, [A1]              ; first operand (input/output)
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; write the result over the first operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3630
3631IEMIMPL_MEDIA_F2 pshufb, 1
3632IEMIMPL_MEDIA_F2 pand, 1
3633IEMIMPL_MEDIA_F2 pandn, 1
3634IEMIMPL_MEDIA_F2 por, 1
3635IEMIMPL_MEDIA_F2 pxor, 1
3636IEMIMPL_MEDIA_F2 pcmpeqb, 1
3637IEMIMPL_MEDIA_F2 pcmpeqw, 1
3638IEMIMPL_MEDIA_F2 pcmpeqd, 1
3639IEMIMPL_MEDIA_F2 pcmpeqq, 0
3640IEMIMPL_MEDIA_F2 pcmpgtb, 1
3641IEMIMPL_MEDIA_F2 pcmpgtw, 1
3642IEMIMPL_MEDIA_F2 pcmpgtd, 1
3643IEMIMPL_MEDIA_F2 pcmpgtq, 0
3644IEMIMPL_MEDIA_F2 paddb, 1
3645IEMIMPL_MEDIA_F2 paddw, 1
3646IEMIMPL_MEDIA_F2 paddd, 1
3647IEMIMPL_MEDIA_F2 paddq, 1
3648IEMIMPL_MEDIA_F2 paddsb, 1
3649IEMIMPL_MEDIA_F2 paddsw, 1
3650IEMIMPL_MEDIA_F2 paddusb, 1
3651IEMIMPL_MEDIA_F2 paddusw, 1
3652IEMIMPL_MEDIA_F2 psubb, 1
3653IEMIMPL_MEDIA_F2 psubw, 1
3654IEMIMPL_MEDIA_F2 psubd, 1
3655IEMIMPL_MEDIA_F2 psubq, 1
3656IEMIMPL_MEDIA_F2 psubsb, 1
3657IEMIMPL_MEDIA_F2 psubsw, 1
3658IEMIMPL_MEDIA_F2 psubusb, 1
3659IEMIMPL_MEDIA_F2 psubusw, 1
3660IEMIMPL_MEDIA_F2 pmullw, 1
3661IEMIMPL_MEDIA_F2 pmulld, 0
3662IEMIMPL_MEDIA_F2 pmulhw, 1
3663IEMIMPL_MEDIA_F2 pmaddwd, 1
3664IEMIMPL_MEDIA_F2 pminub, 1
3665IEMIMPL_MEDIA_F2 pminuw, 0
3666IEMIMPL_MEDIA_F2 pminud, 0
3667IEMIMPL_MEDIA_F2 pminsb, 0
3668IEMIMPL_MEDIA_F2 pminsw, 1
3669IEMIMPL_MEDIA_F2 pminsd, 0
3670IEMIMPL_MEDIA_F2 pmaxub, 1
3671IEMIMPL_MEDIA_F2 pmaxuw, 0
3672IEMIMPL_MEDIA_F2 pmaxud, 0
3673IEMIMPL_MEDIA_F2 pmaxsb, 0
3674IEMIMPL_MEDIA_F2 pmaxsw, 1
3675IEMIMPL_MEDIA_F2 pmaxsd, 0
3676IEMIMPL_MEDIA_F2 pabsb, 1
3677IEMIMPL_MEDIA_F2 pabsw, 1
3678IEMIMPL_MEDIA_F2 pabsd, 1
3679IEMIMPL_MEDIA_F2 psignb, 1
3680IEMIMPL_MEDIA_F2 psignw, 1
3681IEMIMPL_MEDIA_F2 psignd, 1
3682IEMIMPL_MEDIA_F2 phaddw, 1
3683IEMIMPL_MEDIA_F2 phaddd, 1
3684IEMIMPL_MEDIA_F2 phsubw, 1
3685IEMIMPL_MEDIA_F2 phsubd, 1
3686IEMIMPL_MEDIA_F2 phaddsw, 1
3687IEMIMPL_MEDIA_F2 phsubsw, 1
3688IEMIMPL_MEDIA_F2 pmaddubsw, 1
3689IEMIMPL_MEDIA_F2 pmulhrsw, 1
3690IEMIMPL_MEDIA_F2 pmuludq, 1
3691
3692
3693;;
3694; Media instruction working on two full sized registers, but no FXSAVE state argument.
3695;
3696; @param 1 The instruction
3697; @param 2 Whether there is an MMX variant (1) or not (0).
3698;
3699; @param A0 Pointer to the first media register size operand (input/output).
3700; @param A1 Pointer to the second media register size operand (input).
3701;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
; 64-bit MMX flavour, only generated when macro parameter 2 is non-zero.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; second operand (input only)
        movq    mm0, [A0]               ; first operand (input/output)
        %1      mm0, mm1
        movq    [A0], mm0               ; write the result over the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit SSE flavour, always generated.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; second operand (input only)
        movdqu  xmm0, [A0]              ; first operand (input/output)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write the result over the first operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3731
3732IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3733IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3734IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3735IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3736IEMIMPL_MEDIA_OPT_F2 psllw, 1
3737IEMIMPL_MEDIA_OPT_F2 pslld, 1
3738IEMIMPL_MEDIA_OPT_F2 psllq, 1
3739IEMIMPL_MEDIA_OPT_F2 psrlw, 1
3740IEMIMPL_MEDIA_OPT_F2 psrld, 1
3741IEMIMPL_MEDIA_OPT_F2 psrlq, 1
3742IEMIMPL_MEDIA_OPT_F2 psraw, 1
3743IEMIMPL_MEDIA_OPT_F2 psrad, 1
3744IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
3745IEMIMPL_MEDIA_OPT_F2 pavgb, 1
3746IEMIMPL_MEDIA_OPT_F2 pavgw, 1
3747IEMIMPL_MEDIA_OPT_F2 psadbw, 1
3748IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
3749IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
3750IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
3751IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
3752IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3753IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3754IEMIMPL_MEDIA_OPT_F2 aesimc, 0
3755IEMIMPL_MEDIA_OPT_F2 aesenc, 0
3756IEMIMPL_MEDIA_OPT_F2 aesdec, 0
3757IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
3758IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3759IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
3760IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
3761IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
3762IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
3763IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3764
3765;;
3766; Media instruction working on one full sized and one half sized register (lower half).
3767;
3768; @param 1 The instruction
3769; @param 2 1 if MMX is included, 0 if not.
3770;
3771; @param A0 Pointer to the first full sized media register operand (input/output).
3772; @param A1 Pointer to the second half sized media register operand (input).
3773;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
; 64-bit MMX flavour, only generated when macro parameter 2 is non-zero.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; second operand, loaded in full
        movq    mm0, [A0]               ; first operand (input/output)
        %1      mm0, mm1
        movq    [A0], mm0               ; write the result over the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; 128-bit SSE flavour, always generated.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; second operand, loaded in full
        movdqu  xmm0, [A0]              ; first operand (input/output)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write the result over the first operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3808
3809
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; Generates a 128-bit (_u128) and a 256-bit (_u256) worker for the instruction.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3854
3855
3856;;
3857; Media instruction working on one full sized and one half sized register (high half).
3858;
3859; @param 1 The instruction
3860; @param 2 1 if MMX is included, 0 if not.
3861;
3862; @param A0 Pointer to the first full sized media register operand (input/output).
3863; @param A1 Pointer to the second full sized media register operand, where we
3864; will only use the upper half as input - but we'll load it in full.
3865;
%macro IEMIMPL_MEDIA_F1H1 2
; The worker code is the same as for the low-half flavour; only the
; instruction passed in %1 differs, so forward to IEMIMPL_MEDIA_F1L1.
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high-half wrapper defined above instead of calling
; IEMIMPL_MEDIA_F1L1 directly. The expansion is identical.
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3874
3875
3876;;
3877; Media instruction working two half sized input registers (high half) and a full sized
3878; destination register (vpunpckh*).
3879;
3880; @param 1 The instruction
3881;
3882; @param A0 Pointer to the destination register (full sized, output only).
3883; @param A1 Pointer to the first full sized media source register operand, where we
3884; will only use the upper half as input - but we'll be loading it in full.
3885; @param A2 Pointer to the second full sized media source register operand, where we
3886; will only use the upper half as input - but we'll be loading it in full.
3887;
%macro IEMIMPL_MEDIA_F1H1H1 1
; Pure alias: the worker code is the same as for the low-half flavour, only
; the instruction passed in %1 differs.
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3896
3897
3898;
3899; Shufflers with evil 8-bit immediates.
3900;
3901
;
; PSHUFW with the 8-bit immediate supplied at runtime.
;
; The immediate cannot be encoded dynamically, so a table with one
; 'pshufw mm0, mm1, imm / ret' stub per possible immediate (256 entries of
; 5 bytes each) follows the procedure and the matching stub is called.
;
; A0 = pointer to the 64-bit destination, A1 = pointer to the 64-bit source,
; A2 = the immediate.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        and     A2, 0ffh                ; A2 indexes a 256 entry table; don't trust the upper register bits
        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia! (was a no-op 'movq mm0, mm0')
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1                      ; runs the stub for this immediate: mm0 = pshufw(mm1, imm)
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
        ; Assembly time size check of the table: each word only fits in
        ; 16 bits when the table is exactly 0x500 bytes long.
        dw      0xfaff + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw      0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64
3927
3928
;
; SSE shuffles with the 8-bit immediate supplied at runtime.
;
; Expands to a worker followed by a table of 256 'insn xmm0, xmm1, imm / ret'
; stubs (6 bytes each); the stub matching the immediate is called.
;
; Macro parameter 1 is the instruction. A0 = pointer to the 128-bit
; destination, A1 = pointer to the 128-bit source, A2 = the immediate.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A2, 0ffh                ; A2 indexes a 256 entry table; don't trust the upper register bits
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1                      ; runs the stub for this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        ; Assembly time size check of the table: each word only fits in
        ; 16 bits when the table is exactly 0x600 bytes long.
        dw      0xf9ff + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw      0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3960
3961
;
; AVX 256-bit shuffles with the 8-bit immediate supplied at runtime.
;
; Expands to a worker followed by a table of 256 'insn ymm0, ymm1, imm / ret'
; stubs (6 bytes each); the stub matching the immediate is called.
;
; Macro parameter 1 is the instruction. A0 = pointer to the 256-bit
; destination, A1 = pointer to the 256-bit source, A2 = the immediate.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX code, so use the AVX hooks (was the SSE pair)

        and     A2, 0ffh                ; A2 indexes a 256 entry table; don't trust the upper register bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1                      ; runs the stub for this immediate
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        ; Assembly time size check of the table: each word only fits in
        ; 16 bits when the table is exactly 0x600 bytes long.
        dw      0xf9ff + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw      0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3993
3994
3995;
3996; Shifts with evil 8-bit immediates.
3997;
3998
;
; MMX shifts with the 8-bit immediate supplied at runtime.
;
; Expands to a worker followed by a table of 256 'insn mm0, imm / ret'
; stubs (5 bytes each); the stub matching the immediate is called.
;
; Macro parameter 1 is the instruction. A0 = pointer to the 64-bit operand
; (input/output), A1 = the immediate shift count.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        and     A1, 0ffh                ; A1 indexes a 256 entry table; don't trust the upper register bits
        movq    mm0, [A0]
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1                      ; runs the stub for this immediate
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
        ; Assembly time size check of the table: each word only fits in
        ; 16 bits when the table is exactly 0x500 bytes long.
        dw      0xfaff + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw      0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4034
4035
;
; SSE shifts with the 8-bit immediate supplied at runtime.
;
; Expands to a worker followed by a table of 256 'insn xmm0, imm / ret'
; stubs (6 bytes each); the stub matching the immediate is called.
;
; Macro parameter 1 is the instruction. A0 = pointer to the 128-bit operand
; (input/output), A1 = the immediate shift count.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A1, 0ffh                ; A1 indexes a 256 entry table; don't trust the upper register bits
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: (A1 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1                      ; runs the stub for this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        ; Assembly time size check of the table: each word only fits in
        ; 16 bits when the table is exactly 0x600 bytes long.
        dw      0xf9ff + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw      0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4073
4074
4075;
4076; Move byte mask.
4077;
4078
; PMOVMSKB, MMX flavour: A0 = pointer to the destination, A1 = pointer to the
; 64-bit source.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; fetch the source
        pmovmskb T0, mm1                ; T0 = byte mask
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit hosts: also clear the dword at [A0 + 4]
%endif
        mov     [A0], T0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4092
; PMOVMSKB, SSE flavour: A0 = pointer to the destination, A1 = pointer to the
; 128-bit source.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; fetch the source
        pmovmskb T0, xmm1               ; T0 = byte mask
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit hosts: also clear the dword at [A0 + 4]
%endif
        mov     [A0], T0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4106
; VPMOVMSKB, 256-bit flavour: A0 = pointer to the destination, A1 = pointer to
; the 256-bit source.
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]              ; fetch the source
        vpmovmskb T0, ymm1              ; T0 = byte mask
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit hosts: also clear the dword at [A0 + 4]
%endif
        mov     [A0], T0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4120
4121
4122;;
4123; Media instruction working on two full sized source registers and one destination (AVX).
4124;
4125; @param 1 The instruction
4126;
4127; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4128; @param A1 Pointer to the destination media register size operand (output).
4129; @param A2 Pointer to the first source media register size operand (input).
4130; @param A3 Pointer to the second source media register size operand (input).
4131;
; Generates the _u128 and _u256 workers for a three operand AVX instruction.
; A0 is not touched by the generated code; the result goes to [A1] and the
; sources are read from [A2] and [A3].
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4159
4160IEMIMPL_MEDIA_F3 vpshufb
4161IEMIMPL_MEDIA_F3 vpand
4162IEMIMPL_MEDIA_F3 vpminub
4163IEMIMPL_MEDIA_F3 vpminuw
4164IEMIMPL_MEDIA_F3 vpminud
4165IEMIMPL_MEDIA_F3 vpminsb
4166IEMIMPL_MEDIA_F3 vpminsw
4167IEMIMPL_MEDIA_F3 vpminsd
4168IEMIMPL_MEDIA_F3 vpmaxub
4169IEMIMPL_MEDIA_F3 vpmaxuw
4170IEMIMPL_MEDIA_F3 vpmaxud
4171IEMIMPL_MEDIA_F3 vpmaxsb
4172IEMIMPL_MEDIA_F3 vpmaxsw
4173IEMIMPL_MEDIA_F3 vpmaxsd
4174IEMIMPL_MEDIA_F3 vpandn
4175IEMIMPL_MEDIA_F3 vpor
4176IEMIMPL_MEDIA_F3 vpxor
4177IEMIMPL_MEDIA_F3 vpcmpeqb
4178IEMIMPL_MEDIA_F3 vpcmpeqw
4179IEMIMPL_MEDIA_F3 vpcmpeqd
4180IEMIMPL_MEDIA_F3 vpcmpeqq
4181IEMIMPL_MEDIA_F3 vpcmpgtb
4182IEMIMPL_MEDIA_F3 vpcmpgtw
4183IEMIMPL_MEDIA_F3 vpcmpgtd
4184IEMIMPL_MEDIA_F3 vpcmpgtq
4185IEMIMPL_MEDIA_F3 vpaddb
4186IEMIMPL_MEDIA_F3 vpaddw
4187IEMIMPL_MEDIA_F3 vpaddd
4188IEMIMPL_MEDIA_F3 vpaddq
4189IEMIMPL_MEDIA_F3 vpsubb
4190IEMIMPL_MEDIA_F3 vpsubw
4191IEMIMPL_MEDIA_F3 vpsubd
4192IEMIMPL_MEDIA_F3 vpsubq
4193
4194
4195;;
4196; Media instruction working on two full sized source registers and one destination (AVX),
4197; but no XSAVE state pointer argument.
4198;
4199; @param 1 The instruction
4200;
4201; @param A0 Pointer to the destination media register size operand (output).
4202; @param A1 Pointer to the first source media register size operand (input).
4203; @param A2 Pointer to the second source media register size operand (input).
4204;
; Generates the _u128 and _u256 workers for a three operand AVX instruction
; without a state pointer argument: the result goes to [A0] and the sources
; are read from [A1] and [A2].
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4232
4233IEMIMPL_MEDIA_OPT_F3 vpacksswb
4234IEMIMPL_MEDIA_OPT_F3 vpackssdw
4235IEMIMPL_MEDIA_OPT_F3 vpackuswb
4236IEMIMPL_MEDIA_OPT_F3 vpackusdw
4237IEMIMPL_MEDIA_OPT_F3 vpmullw
4238IEMIMPL_MEDIA_OPT_F3 vpmulld
4239IEMIMPL_MEDIA_OPT_F3 vpmulhw
4240IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4241IEMIMPL_MEDIA_OPT_F3 vpavgb
4242IEMIMPL_MEDIA_OPT_F3 vpavgw
4243IEMIMPL_MEDIA_OPT_F3 vpsignb
4244IEMIMPL_MEDIA_OPT_F3 vpsignw
4245IEMIMPL_MEDIA_OPT_F3 vpsignd
4246IEMIMPL_MEDIA_OPT_F3 vphaddw
4247IEMIMPL_MEDIA_OPT_F3 vphaddd
4248IEMIMPL_MEDIA_OPT_F3 vphsubw
4249IEMIMPL_MEDIA_OPT_F3 vphsubd
4250IEMIMPL_MEDIA_OPT_F3 vphaddsw
4251IEMIMPL_MEDIA_OPT_F3 vphsubsw
4252IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4253IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4254IEMIMPL_MEDIA_OPT_F3 vpsadbw
4255IEMIMPL_MEDIA_OPT_F3 vpmuldq
4256IEMIMPL_MEDIA_OPT_F3 vpmuludq
4257IEMIMPL_MEDIA_OPT_F3 vunpcklps
4258IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4259IEMIMPL_MEDIA_OPT_F3 vunpckhps
4260IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4261
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1       The instruction
; @param    2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd,      1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4304
4305
4306;
4307; The SSE 4.2 crc32
4308;
; @param    A0      Pointer to the 32-bit destination.
; @param    A1      The source operand, sized according to the suffix.
4311;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; current 32-bit value at the destination
        crc32   T0_32, A1_8             ; accumulate the 8-bit source
        mov     dword [A0], T0_32       ; write the updated value back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4321
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; current 32-bit value at the destination
        crc32   T0_32, A1_16            ; accumulate the 16-bit source
        mov     dword [A0], T0_32       ; write the updated value back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4331
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; current 32-bit value at the destination
        crc32   T0_32, A1_32            ; accumulate the 32-bit source
        mov     dword [A0], T0_32       ; write the updated value back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4341
%ifdef RT_ARCH_AMD64
; Only assembled when RT_ARCH_AMD64 is defined.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; current 32-bit value at the destination
        crc32   T0, A1                  ; accumulate the full-width source
        mov     dword [A0], T0_32       ; only the low 32 bits are written back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4353
4354
4355;
4356; PTEST (SSE 4.1)
4357;
4358; @param A0 Pointer to the first source operand (aka readonly destination).
4359; @param A1 Pointer to the second source operand.
4360; @param A2 Pointer to the EFLAGS register.
4361;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; second source operand
        movdqu  xmm0, [A0]              ; first source operand (read only)
        ptest   xmm0, xmm1              ; result is in EFLAGS only
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4374
; VPTEST, 256-bit flavour: A0 and A1 point to the two source operands and A2 is
; handed to IEM_SAVE_FLAGS as the EFLAGS destination.
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX code, so use the AVX hooks (was the SSE pair)

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1              ; result is in EFLAGS only
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4387
4388
4389;;
4390; Template for the [v]pmov{s,z}x* instructions
4391;
4392; @param 1 The instruction
4393;
4394; @param A0 Pointer to the destination media register size operand (output).
4395; @param A1 The source operand value (input).
4396;
; Generates three workers per instruction: the legacy SSE one (_u128) plus the
; VEX encoded 128-bit and 256-bit ones (v<insn>_u128 and v<insn>_u256).
; Note that the 256-bit worker takes A1 as a pointer to its 128-bit source,
; whereas the two 128-bit workers take the source value in A1 directly.
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        movdqu  [A0], xmm0              ; legacy SSE store; the former vmovdqu required AVX on the host

        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE (both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1                ; left as is: A1's register width depends on the target
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; VEX encoded load to avoid mixing legacy SSE into AVX code
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro
4434
4435IEMIMPL_V_PMOV_SZ_X pmovsxbw
4436IEMIMPL_V_PMOV_SZ_X pmovsxbd
4437IEMIMPL_V_PMOV_SZ_X pmovsxbq
4438IEMIMPL_V_PMOV_SZ_X pmovsxwd
4439IEMIMPL_V_PMOV_SZ_X pmovsxwq
4440IEMIMPL_V_PMOV_SZ_X pmovsxdq
4441
4442IEMIMPL_V_PMOV_SZ_X pmovzxbw
4443IEMIMPL_V_PMOV_SZ_X pmovzxbd
4444IEMIMPL_V_PMOV_SZ_X pmovzxbq
4445IEMIMPL_V_PMOV_SZ_X pmovzxwd
4446IEMIMPL_V_PMOV_SZ_X pmovzxwq
4447IEMIMPL_V_PMOV_SZ_X pmovzxdq
4448
4449
4450;;
4451; Need to move this as well somewhere better?
4452;
; Result layout written by the SSE floating point workers in this file.
struc IEMSSERESULT
        .uResult resd 4                 ; 4 dwords (16 bytes) holding the result value
        .MXCSR   resd 1                 ; 1 dword holding the MXCSR value to return
endstruc
4457
4458
4459;;
4460; Need to move this as well somewhere better?
4461;
; Result layout written by the 128-bit AVX floating point workers in this file.
struc IEMAVX128RESULT
        .uResult resd 4                 ; 4 dwords (16 bytes) holding the result value
        .MXCSR   resd 1                 ; 1 dword holding the MXCSR value to return
endstruc
4466
4467
4468;;
4469; Need to move this as well somewhere better?
4470;
; Result layout written by the 256-bit AVX floating point workers in this file.
struc IEMAVX256RESULT
        .uResult resd 8                 ; 8 dwords (32 bytes) holding the result value
        .MXCSR   resd 1                 ; 1 dword holding the MXCSR value to return
endstruc
4475
4476
4477;;
4478; Initialize the SSE MXCSR register using the guest value partially to
4479; account for rounding mode.
4480;
4481; @uses 4 bytes of stack to save the original value, T0.
4482; @param 1 Expression giving the address of the FXSTATE of the guest.
4483;
%macro SSE_LD_FXSTATE_MXCSR 1
        ; This 4-byte slot is NOT released here: it keeps the MXCSR value that
        ; was active on entry until SSE_ST_FXSTATE_MXCSR reloads and pops it.
        sub     xSP, 4

        stmxcsr [xSP]
        ; Take only the FZ, rounding control and DAZ bits from the guest value
        ; and set all the exception mask bits.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK
        ; ldmxcsr needs a memory operand, so go via a temporary stack slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4496
4497
4498;;
4499; Restores the SSE MXCSR register with the original value.
4500;
4501; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4502; @param 1 Expression giving the address where to return the MXCSR value.
4503; @param 2 Expression giving the address of the FXSTATE of the guest.
4504;
4505; @note Restores the stack pointer.
4506;
%macro SSE_ST_FXSTATE_MXCSR 2
        ; Read the current MXCSR into T0 via a temporary 4-byte stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ; xSP points at the slot left behind by SSE_LD_FXSTATE_MXCSR again;
        ; reload the MXCSR value saved there and release the slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4521
4522
4523;;
4524; Initialize the SSE MXCSR register using the guest value partially to
4525; account for rounding mode.
4526;
4527; @uses 4 bytes of stack to save the original value.
4528; @param 1 Expression giving the address of the FXSTATE of the guest.
4529;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        ; This 4-byte slot is NOT released here: it keeps the MXCSR value that
        ; was active on entry until one of the AVX*_ST_XSAVEAREA_MXCSR macros
        ; reloads and pops it.
        sub     xSP, 4

        stmxcsr [xSP]
        ; Take only the FZ, rounding control and DAZ bits from the guest value
        ; (clobbers T0).
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        ; NOTE(review): unlike SSE_LD_FXSTATE_MXCSR, X86_MXCSR_XCPT_MASK is not
        ; OR'ed in here, so the value loaded below has every bit outside
        ; FZ/RC/DAZ clear - confirm that this is intended.
        ; ldmxcsr needs a memory operand, so go via a temporary stack slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4541
4542
4543;;
4544; Restores the AVX128 MXCSR register with the original value.
4545;
4546; @param 1 Expression giving the address where to return the MXCSR value.
4547;
4548; @note Restores the stack pointer.
4549;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        ; The current MXCSR is stored unmodified into the result structure.
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR]

        ; Reload the MXCSR value saved by AVX_LD_XSAVEAREA_MXCSR and release
        ; its stack slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4556
4557
4558;;
4559; Restores the AVX256 MXCSR register with the original value.
4560;
4561; @param 1 Expression giving the address where to return the MXCSR value.
4562;
4563; @note Restores the stack pointer.
4564;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        ; The current MXCSR is stored unmodified into the result structure.
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR]

        ; Reload the MXCSR value saved by AVX_LD_XSAVEAREA_MXCSR and release
        ; its stack slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4571
4572
4573;;
4574; Floating point instruction working on two full sized registers.
4575;
4576; @param 1 The instruction
4577; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4578;
4579; @param A0 FPU context (FXSTATE or XSAVEAREA).
4580; @param A1 Where to return the result including the MXCSR value.
4581; @param A2 Pointer to the first media register size operand (input/output).
4582; @param A3 Pointer to the second media register size operand (input).
4583;
; Generates the SSE worker (_u128) and, depending on macro parameter 2, the
; AVX workers (v<insn>_u128 and v<insn>_u256):
;   %2 == 3: AVX instruction takes three operands (dst, src1, src2).
;   %2 == 2: AVX instruction takes two operands (dst, src).
;   otherwise: no AVX workers are generated.
; The result is always written to [A1]; [A2] and [A3] are only read.
;
; NOTE(review): every procedure here passes 12 to BEGINPROC_FASTCALL while
; using four arguments; the other four argument procedures in this file pass
; 16 - confirm which value is intended.
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; leaves 4 bytes on the stack until the ST macro below

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE (both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0       ; leaves 4 bytes on the stack until the ST macro below

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0       ; leaves 4 bytes on the stack until the ST macro below

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0       ; leaves 4 bytes on the stack until the ST macro below

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0       ; leaves 4 bytes on the stack until the ST macro below

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro
4662
4663IEMIMPL_FP_F2 addps, 3
4664IEMIMPL_FP_F2 addpd, 3
4665IEMIMPL_FP_F2 mulps, 3
4666IEMIMPL_FP_F2 mulpd, 3
4667IEMIMPL_FP_F2 subps, 3
4668IEMIMPL_FP_F2 subpd, 3
4669IEMIMPL_FP_F2 minps, 3
4670IEMIMPL_FP_F2 minpd, 3
4671IEMIMPL_FP_F2 divps, 3
4672IEMIMPL_FP_F2 divpd, 3
4673IEMIMPL_FP_F2 maxps, 3
4674IEMIMPL_FP_F2 maxpd, 3
4675IEMIMPL_FP_F2 haddps, 3
4676IEMIMPL_FP_F2 haddpd, 3
4677IEMIMPL_FP_F2 hsubps, 3
4678IEMIMPL_FP_F2 hsubpd, 3
4679IEMIMPL_FP_F2 addsubps, 3
4680IEMIMPL_FP_F2 addsubpd, 3
4681
4682
4683;;
4684; These are actually unary operations but to keep it simple
4685; we treat them as binary for now, so the output result is
4686; always in sync with the register where the result might get written
4687; to.
4688IEMIMPL_FP_F2 sqrtps, 2
4689IEMIMPL_FP_F2 rsqrtps, 2
4690IEMIMPL_FP_F2 sqrtpd, 2
4691IEMIMPL_FP_F2 cvtdq2ps, 2
4692IEMIMPL_FP_F2 cvtps2dq, 2
4693IEMIMPL_FP_F2 cvttps2dq, 2
4694IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4695IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
4696IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4697
4698
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; Generates an SSE worker (iemAImpl_<insn>_u128_r32) and an AVX worker
; (iemAImpl_v<insn>_u128_r32).  Both load the full 128-bit first operand and
; only 32 bits of the second operand before executing the instruction.
;
; @param    1       The instruction
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu   xmm0, [A2]
        movd     xmm1, [A3]             ; Only the low dword is read from memory.
        %1       xmm0, xmm1
        movdqu   [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm0, [A2]
        vmovd    xmm1, [A3]             ; Only the low dword is read from memory.
        v %+ %1  xmm0, xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE, see the _r64 sibling macro.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
4750
4751
;;
; Scalar double precision worker template: full sized register combined with
; a 64-bit floating point value.
;
; Emits iemAImpl_<insn>_u128_r64 (SSE) and iemAImpl_v<insn>_u128_r64 (AVX).
;
; @param    1       The instruction
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        ; Fetch the operands: scalar source into xmm1, full register into xmm0.
        movq     xmm1, qword [A3]
        movdqu   xmm0, [A2]
        %1       xmm0, xmm1
        ; Hand back the complete 128-bit register value.
        movdqu   [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        ; Same operand setup as the SSE worker, VEX encoded.
        vmovq    xmm1, qword [A3]
        vmovdqu  xmm0, [A2]
        v %+ %1  xmm0, xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4802
4803
;;
; Worker template for the cvtpd2ps/cvtps2pd conversions.
;
; Emits the SSE _u128 worker plus the AVX _u128 and _u256 workers.  In the
; _u256 worker the source and destination widths differ, which is selected
; by the second macro parameter.
;
; @param    1       The instruction name.
; @param    2       Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        ; xmm1 = source, xmm0 = current destination value.
        movdqu   xmm1, [A3]
        movdqu   xmm0, [A2]
        %1       xmm0, xmm1
        movdqu   [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm1, [A3]
        vmovdqu  xmm0, [A2]
        v %+ %1  xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  ymm1, [A3]
        vmovdqu  ymm0, [A2]
 %if %2 != 0
        v %+ %1  ymm0, xmm1             ; 128-bit source, 256-bit result.
 %else
        v %+ %1  xmm0, ymm1             ; 256-bit source, 128-bit result.
 %endif
        vmovdqu  [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
4868
4869
;;
; shufps instructions with 8-bit immediates.
;
; The immediate must be part of the instruction encoding, so a table of 256
; fixed size stubs (shufps + ret + int3) is emitted after the function body
; and the stub matching the immediate is called.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A2, 0xff                ; Only the low byte is the immediate; the calling convention does not
                                        ; guarantee the upper register bits are zero and they would otherwise
                                        ; take the computed call target outside the stub table.
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufps  xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufps_u128
4903
4904
;;
; shufpd instruction with 8-bit immediates.
;
; Dispatches into a table of 256 fixed size stubs (shufpd + ret), one per
; immediate value, emitted after the function body.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A2, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufpd  xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufpd_u128
4937
4938
;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Emits a 128-bit and a 256-bit worker.  Each dispatches into its own table
; of 256 fixed size stubs (vshufpX + ret), one per immediate value.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        and     A3, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        vmovdqu xmm0, [A1]              ; VEX encoded moves, avoids mixing legacy SSE and AVX code.
        vmovdqu xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        and     A3, 0xff                ; See the 128-bit worker.
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5005
5006
;;
; One of the [p]blendv{b,ps,pd} variants
;
; These instructions take the blend mask implicitly in XMM0, so the mask is
; loaded there and the two explicit operands go into XMM1 and XMM2.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      Pointer to the second media sized value (input).
; @param    A2      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE: every prologue must be paired with its epilogue.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5035
5036
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; Emits a 128-bit and a 256-bit worker; the mask is an explicit fourth
; register operand in the AVX forms.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register sized operand (output).
; @param    A1      Pointer to the first media register sized operand (input).
; @param    A2      Pointer to the second media register sized operand (input).
; @param    A3      Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5079
5080
;;
; palignr mm1, mm2/m64 instruction.
;
; Dispatches into a table of 256 fixed size stubs (palignr + ret), one per
; immediate value, emitted after the function body.
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      The second register sized operand (input).
; @param    A2      The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        and     A2, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movq    mm0, [A0]
        movq    mm1, A1                 ; The second operand is passed by value.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        palignr mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_palignr_u64
5112
5113
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; The worker dispatches into a table of 256 stubs of 8 bytes each
; (instruction + ret + int3), one per immediate value.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A2, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*3]         ; sizeof(insnX+ret+int3) == 8: (A2 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5161
5162
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Each worker dispatches into its own table of 256 stubs of 8 bytes each
; (instruction + ret + int3), one per immediate value.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        and     A3, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        vmovdqu xmm0, [A1]              ; VEX encoded moves, avoids mixing legacy SSE and AVX code.
        vmovdqu xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret+int3) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        and     A3, 0xff                ; See the 128-bit worker.
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret+int3) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5239
5240
;;
; Source operand layouts used by the pcmp{i,e}str{i,m} workers below.
; (Original note: need to move this as well somewhere better?)
;

; Implicit length variants: two 128-bit source operands.
struc IEMPCMPISTRXSRC
    .uSrc1          resb 16
    .uSrc2          resb 16
endstruc

; Explicit length variants: two 128-bit source operands followed by the
; 64-bit RAX and RDX input values.
struc IEMPCMPESTRXSRC
    .uSrc1          resb 16
    .uSrc2          resb 16
    .u64Rax         resq 1
    .u64Rdx         resq 1
endstruc
5255
;;
; The pcmpistri instruction.
;
; Dispatches into a table of 256 stubs of 8 bytes each (pcmpistri + ret +
; int3), one per immediate value.  The instruction leaves its index result
; in ECX, which is stored through the A0 pointer afterwards.
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A3, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret+int3) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; The index result of pcmpistri.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistri xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistri_u128
5293
;;
; The pcmpestri instruction.
;
; Dispatches into a table of 256 stubs of 8 bytes each, one per immediate
; value.  The explicit string lengths are taken from the u64Rax/u64Rdx
; members of the source structure and loaded into xAX/xDX right before the
; call.  The instruction leaves its index result in ECX, which is stored
; through the A0 pointer afterwards.
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A3, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; The index result of pcmpestri.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        ; pcmpestri xmm0, xmm1, bImm with REX.W so the full RAX/RDX are used as lengths
        ; (EAX/EDX only otherwise).  Hand encoded because REX must directly precede the
        ; 0x0f escape byte, i.e. come *after* the mandatory 0x66 prefix; a 0x48 byte
        ; emitted in front of the assembled instruction ends up before the 0x66 and is
        ; ignored by the CPU.  ModRM 0xc1: reg=xmm0, rm=xmm1.
        db      0x66, 0x48, 0x0f, 0x3a, 0x61, 0xc1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpestri_u128
5335
;;
; The pcmpistrm instruction template.
;
; Dispatches into a table of 256 stubs of 8 bytes each (pcmpistrm + ret +
; int3), one per immediate value.  The instruction leaves its mask result
; in XMM0, which is stored through the A0 pointer afterwards; the source
; operands are therefore kept in XMM1 and XMM2.
;
; @param    A0      Pointer to the XMM0 register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A3, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret+int3) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0              ; The mask result of pcmpistrm.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistrm_u128
5372
;;
; The pcmpestrm instruction template.
;
; Dispatches into a table of 256 stubs of 8 bytes each, one per immediate
; value.  The explicit string lengths are taken from the u64Rax/u64Rdx
; members of the source structure and loaded into xAX/xDX right before the
; call.  The instruction leaves its mask result in XMM0, which is stored
; through the A0 pointer afterwards.
;
; @param    A0      Pointer to the XMM0 register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A3, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0              ; The mask result of pcmpestrm.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        ; pcmpestrm xmm1, xmm2, bImm with REX.W so the full RAX/RDX are used as lengths
        ; (EAX/EDX only otherwise).  Hand encoded because REX must directly precede the
        ; 0x0f escape byte, i.e. come *after* the mandatory 0x66 prefix; a 0x48 byte
        ; emitted in front of the assembled instruction ends up before the 0x66 and is
        ; ignored by the CPU.  ModRM 0xca: reg=xmm1, rm=xmm2.
        db      0x66, 0x48, 0x0f, 0x3a, 0x60, 0xca, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpestrm_u128
5413
5414
;;
; pinsrw instruction.
;
; The 64-bit (MMX register) worker.  Dispatches into a table of 256 stubs of
; 5 bytes each (pinsrw + ret), one per immediate value.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      The 16 bit input operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE            ; Works on an MMX register, so pair with the MMX prologue/epilogue
                                        ; like iemAImpl_palignr_u64 does.

        and     A2, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movq    mm0, [A0]
        lea     T0, [A2 + A2*4]         ; sizeof(pinsrw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  mm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u64
5446
;
; The 128-bit (XMM register) pinsrw worker.  Takes the same arguments as the
; 64-bit one: A0 = pointer to the media register operand (input/output),
; A1 = the 16 bit input operand, A2 = the 8-bit immediate.  Dispatches into a
; table of 256 stubs of 6 bytes each (pinsrw + ret).
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A2, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u128
5471
;;
; vpinsrw instruction.
;
; Dispatches into a table of 256 stubs of 6 bytes each (vpinsrw + ret), one
; per immediate value.  The stubs take the word to insert from A1, so A2 is
; copied there once the source register has been loaded.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 16 bit input operand (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE            ; VEX encoded worker, so use the AVX prologue/epilogue pair.

        and     A3, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        vmovdqu xmm0, [A1]              ; VEX encoded moves, avoids mixing legacy SSE and AVX code.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        mov     A1, A2                  ; A2 requires longer encoding on Windows
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpinsrw_u128
5505
5506
;;
; pextrw instruction.
;
; The 64-bit (MMX register) worker.  Dispatches into a table of 256 stubs of
; 5 bytes each (pextrw + ret), one per immediate value.  The stub leaves the
; extracted word in T0.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      The media register size operand, passed by value (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE            ; Works on an MMX register, so pair with the MMX prologue/epilogue
                                        ; like iemAImpl_palignr_u64 does.

        and     A2, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movq    mm0, A1
        lea     T0, [A2 + A2*4]         ; sizeof(pextrw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u64
5538
;
; The 128-bit (XMM register) pextrw worker: A0 = pointer to the 16-bit
; output operand, A1 = pointer to the media register size operand (input),
; A2 = the 8-bit immediate.  Dispatches into a table of 256 stubs of 6 bytes
; each (pextrw + ret); the stub leaves the extracted word in T0.
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A2, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u128
5563
;;
; vpextrw instruction.
;
; Dispatches into a table of 256 stubs of 6 bytes each (vpextrw + ret), one
; per immediate value.  The stub leaves the extracted word in T0.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; VEX encoded worker, so use the AVX prologue/epilogue pair.

        and     A2, 0xff                ; Only the low byte is the immediate; don't trust the upper register
                                        ; bits when computing the stub address.
        vmovdqu xmm0, [A1]              ; VEX encoded move, avoids mixing legacy SSE and AVX code.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(vpextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpextrw T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpextrw_u128
5595
5596
;;
; movmskp{s,d} SSE instruction template
;
; Emits the SSE 128-bit worker plus the AVX 128-bit and 256-bit workers.
; Each one loads the source register, executes the instruction with T0 as
; destination and stores the low byte of T0 through A0.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; VEX encoded move, avoids mixing legacy SSE and AVX code.
        %2      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5646
5647
;;
; Returns the resulting MXCSR value to the caller and reloads MXCSR from the
; value found at the top of the stack.
;
; The current MXCSR is read via a 4 byte scratch slot on the stack, reduced
; to its exception flag bits and OR'ed into the MXCSR value of the guest
; FXSTATE; the combined value is what gets stored at the first parameter.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note     Restores the stack pointer.
; @note     Presumably the dword at the top of the stack on entry is the host
;           MXCSR left there by the matching SSE_LD_FXSTATE_MXCSR - confirm
;           against that macro.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        ; T0 = current MXCSR, fetched through a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Keep only the exception flags and merge them into the guest MXCSR.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload MXCSR from the stack and release that slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5671
5672
;;
; cvttsd2si instruction - 32-bit variant.
;
; Converts the double precision value at A3 to a 32-bit integer with
; truncation and stores it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        ; Convert straight from memory, the result goes via T0.
        cvttsd2si T0_32, qword [A3]
        mov     [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
5693
;;
; cvttsd2si instruction - 64-bit variant.
;
; Converts the double precision value at A3 to a 64-bit integer with
; truncation and stores it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        ; Convert straight from memory, the result goes via T0.
        cvttsd2si T0, qword [A3]
        mov     [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5714
5715
;;
; cvtsd2si instruction - 32-bit variant.
;
; Converts the double precision value at A3 to a 32-bit integer and stores
; it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        ; Convert straight from memory, the result goes via T0.
        cvtsd2si T0_32, qword [A3]
        mov     [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
5736
;;
; cvtsd2si instruction - 64-bit variant.
;
; Converts the double precision value at A3 to a 64-bit integer and stores
; it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        ; Convert straight from memory, the result goes via T0.
        cvtsd2si T0, qword [A3]
        mov     [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
5757
5758
;;
; CVTTSS2SI with a 32-bit integer destination.
;
; Converts the scalar single precision value at [A3] to a signed 32-bit
; integer using truncation and stores the integer at [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0                 ; MXCSR setup from the context in A0 (macro defined earlier in the file).

        cvttss2si T0_32, [A3]                   ; T0_32 = trunc(float at A3)
        mov     dword [A2], T0_32               ; *A2 = 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0        ; MXCSR is handed back through A1 (macro defined earlier in the file).
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32
5779
;;
; CVTTSS2SI with a 64-bit integer destination.
;
; Converts the scalar single precision value at [A3] to a signed 64-bit
; integer using truncation and stores the integer at [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0                 ; MXCSR setup from the context in A0 (macro defined earlier in the file).

        cvttss2si T0, [A3]                      ; T0 = trunc(float at A3)
        mov     qword [A2], T0                  ; *A2 = 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0        ; MXCSR is handed back through A1 (macro defined earlier in the file).
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
5800
5801
;;
; CVTSS2SI with a 32-bit integer destination.
;
; Converts the scalar single precision value at [A3] to a signed 32-bit
; integer (rounding is controlled by the MXCSR in effect) and stores the
; integer at [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0                 ; MXCSR setup from the context in A0 (macro defined earlier in the file).

        cvtss2si T0_32, [A3]                    ; T0_32 = round(float at A3)
        mov     dword [A2], T0_32               ; *A2 = 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0        ; MXCSR is handed back through A1 (macro defined earlier in the file).
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32
5822
;;
; CVTSS2SI with a 64-bit integer destination.
;
; Converts the scalar single precision value at [A3] to a signed 64-bit
; integer (rounding is controlled by the MXCSR in effect) and stores the
; integer at [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0                 ; MXCSR setup from the context in A0 (macro defined earlier in the file).

        cvtss2si T0, [A3]                       ; T0 = round(float at A3)
        mov     qword [A2], T0                  ; *A2 = 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0        ; MXCSR is handed back through A1 (macro defined earlier in the file).
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
5843
5844
;;
; CVTSI2SS with a 32-bit integer source.
;
; Converts the signed 32-bit integer at [A3] to a scalar single precision
; value and stores the 32-bit result at [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0                 ; MXCSR setup from the context in A0 (macro defined earlier in the file).

        cvtsi2ss xmm0, dword [A3]               ; xmm0[31:0] = (float)(int32 at A3)
        movd    dword [A2], xmm0                ; *A2 = low dword of xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0        ; MXCSR is handed back through A1 (macro defined earlier in the file).
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
5865
;;
; CVTSI2SS with a 64-bit integer source.
;
; Converts the signed 64-bit integer at [A3] to a scalar single precision
; value and stores the 32-bit result at [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0                 ; MXCSR setup from the context in A0 (macro defined earlier in the file).

        cvtsi2ss xmm0, qword [A3]               ; xmm0[31:0] = (float)(int64 at A3)
        movd    dword [A2], xmm0                ; *A2 = low dword of xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0        ; MXCSR is handed back through A1 (macro defined earlier in the file).
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
5886
5887
;;
; CVTSI2SD with a 32-bit integer source.
;
; Converts the signed 32-bit integer at [A3] to a scalar double precision
; value and stores the 64-bit result at [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0                 ; MXCSR setup from the context in A0 (macro defined earlier in the file).

        cvtsi2sd xmm0, dword [A3]               ; xmm0[63:0] = (double)(int32 at A3)
        movq    [A2], xmm0                      ; *A2 = low qword of xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0        ; MXCSR is handed back through A1 (macro defined earlier in the file).
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
5908
;;
; CVTSI2SD with a 64-bit integer source.
;
; Converts the signed 64-bit integer at [A3] to a scalar double precision
; value and stores the 64-bit result at [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0                 ; MXCSR setup from the context in A0 (macro defined earlier in the file).

        cvtsi2sd xmm0, qword [A3]               ; xmm0[63:0] = (double)(int64 at A3)
        movq    [A2], xmm0                      ; *A2 = low qword of xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0        ; MXCSR is handed back through A1 (macro defined earlier in the file).
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
5929
5930
;;
; Loads a working MXCSR that is partially derived from the guest value.
;
; Only the FZ, RC and DAZ bits are taken from the guest MXCSR; the
; X86_MXCSR_XCPT_MASK bits are always set in the value that gets loaded.
;
; The MXCSR that was active on entry is stored in a 4 byte stack slot which
; is still allocated when the macro ends (xSP is 4 lower than on entry).
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        ; Keep the current MXCSR at [xSP]; this slot is not released here.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Build the value to load from the guest MXCSR.
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; ldmxcsr needs a memory operand, so bounce the value via a temporary slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5950
5951
;;
; Reports the MXCSR exception flags and restores the original MXCSR.
;
; The X86_MXCSR_XCPT_FLAGS bits of the current MXCSR are OR'ed into the
; dword at the given address.  Afterwards the MXCSR is reloaded from the
; dword at [xSP] and that stack slot is released.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note     Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        ; Fetch the current MXCSR through a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Keep only the exception flags and merge them into the caller's value.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%1]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload the MXCSR saved at [xSP] and drop that slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5974
5975
;
; UCOMISS (SSE)
;
; Runs 'ucomiss' on the two operands and reports the resulting status flags
; and MXCSR back to the caller.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; first operand
        movdqu  xmm1, [A3]                      ; second operand
        ucomiss xmm0, xmm1                      ; compare, result is in EFLAGS
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
5998
;
; VUCOMISS - same argument layout as iemAImpl_ucomiss_u128, but executes the
; VEX encoded form of the instruction.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; first operand
        movdqu  xmm1, [A3]                      ; second operand
        vucomiss xmm0, xmm1                     ; compare, result is in EFLAGS
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
6013
6014
;
; UCOMISD (SSE)
;
; Runs 'ucomisd' on the two operands and reports the resulting status flags
; and MXCSR back to the caller.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; first operand
        movdqu  xmm1, [A3]                      ; second operand
        ucomisd xmm0, xmm1                      ; compare, result is in EFLAGS
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
6037
;
; VUCOMISD - same argument layout as iemAImpl_ucomisd_u128, but executes the
; VEX encoded form of the instruction.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; first operand
        movdqu  xmm1, [A3]                      ; second operand
        vucomisd xmm0, xmm1                     ; compare, result is in EFLAGS
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6052
;
; COMISS (SSE)
;
; Runs 'comiss' on the two operands and reports the resulting status flags
; and MXCSR back to the caller.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; first operand
        movdqu  xmm1, [A3]                      ; second operand
        comiss  xmm0, xmm1                      ; compare, result is in EFLAGS
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
6075
;
; VCOMISS - same argument layout as iemAImpl_comiss_u128, but executes the
; VEX encoded form of the instruction.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; first operand
        movdqu  xmm1, [A3]                      ; second operand
        vcomiss xmm0, xmm1                      ; compare, result is in EFLAGS
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6090
6091
;
; COMISD (SSE)
;
; Runs 'comisd' on the two operands and reports the resulting status flags
; and MXCSR back to the caller.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; first operand
        movdqu  xmm1, [A3]                      ; second operand
        comisd  xmm0, xmm1                      ; compare, result is in EFLAGS
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
6114
;
; VCOMISD - same argument layout as iemAImpl_comisd_u128, but executes the
; VEX encoded form of the instruction.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; first operand
        movdqu  xmm1, [A3]                      ; second operand
        vcomisd xmm0, xmm1                      ; compare, result is in EFLAGS
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6129
6130
;;
; Pair of 128-bit source operands as passed to the two-source media helpers
; below (4 dwords each).
;
; @todo     Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1      resd 4      ; first source, 16 bytes at offset 0
    .uSrc2      resd 4      ; second source, 16 bytes at offset 16
endstruc
6138
6139
;
; CMPPS (SSE)
;
; The immediate cannot be supplied at runtime, so the function calls into a
; table of 256 'cmpps xmm0, xmm1, imm8 + ret' stubs (5 bytes each) which is
; emitted right after the function body.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        and     A3, 0xff                        ; Only the low 8 bits are the immediate; the upper register
                                                ; bits are not guaranteed to be zero and must not end up in
                                                ; the stub table index.
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T0, [A3 + A3*4]                 ; sizeof(cmpps+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]                   ; T1 = address of the stub for this immediate
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        cmpps   xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                        ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)                  ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_cmpps_u128
6175
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; The generated function calls into a table of 256 'insn xmm0, xmm1, imm8 + ret'
; stubs (6 bytes each) which is emitted right after the function body.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        and     A3, 0xff                        ; Only the low 8 bits are the immediate; the upper register
                                                ; bits are not guaranteed to be zero and must not end up in
                                                ; the stub table index.
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]                 ; sizeof(insn+ret) == 6: (A3 * 3) * 2
        lea     T1, [T1 + T0*2]                 ; T1 = address of the stub for this immediate
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                        ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6222
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; The generated function calls into a table of 256 'insn xmm0, xmm1, imm8 + ret'
; stubs (7 bytes each) which is emitted right after the function body.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        and     A3, 0xff                        ; Only the low 8 bits are the immediate; the upper register
                                                ; bits are not guaranteed to be zero and must not end up in
                                                ; the stub table index.
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]                 ; sizeof(insn+ret) == 7: 2 * (A3 * 3) + A3
        lea     T0, [A3 + T0*2]                 ; T0 = A3 * 7
        lea     T1, [T1 + T0]                   ; T1 = address of the stub for this immediate
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                        ; 256*(6+1) == 0x700
dw 0xf8ff  + (.immEnd - .imm0)                  ; will cause warning if entries are too big.
dw 0x106ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6274
6275
;;
; Generates helpers for SSE instructions of the form
;    xxx mm, xmm.
; which need the MXCSR loaded before and saved after the instruction.
;
; The 128-bit source is read from [A2], the instruction is executed with
; mm0 as destination and the 64-bit result is written to [A1].
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; xmm0 = 128-bit source
        %1      mm0, xmm0
        movq    [A1], mm0                       ; *A1 = 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6305
;;
; Generates helpers for SSE instructions of the form
;    xxx xmm, mm/m64.
; which need the MXCSR loaded before and saved after the instruction.
;
; The 128-bit destination is read from [A1], the 64-bit source value in A2
; is moved into mm0, the instruction is executed and the updated 128-bit
; value is written back to [A1].
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A1]                      ; xmm0 = current destination value
        movq    mm0, A2                         ; mm0  = 64-bit source value
        %1      xmm0, mm0
        movdqu  [A1], xmm0                      ; *A1 = updated destination

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6336
;;
; Generates helpers for SSE instructions of the form
;    xxx mm, xmm/m64.
; which need the MXCSR loaded before and saved after the instruction.
;
; The 64-bit source value in A2 is moved into xmm0, the instruction is
; executed with mm0 as destination and the 64-bit result is written to [A1].
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    xmm0, A2                        ; xmm0 = 64-bit source value
        %1      mm0, xmm0
        movq    [A1], mm0                       ; *A1 = 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6366
;
; All forms of RDRAND and RDSEED
;
; Executes the instruction on the given accumulator register, stores the
; register at [A0] and then saves the status flags via A1.
;
; @param    1       The instruction name (rdrand or rdseed).
; @param    2       The register to use (ax, eax or rax).
; @param    3       The operand size in bits, used for the function name suffix.
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2                              ; the instruction reports its outcome in EFLAGS
        mov     [A0], %2                        ; mov leaves EFLAGS untouched
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6391
6392
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The immediate cannot be supplied at runtime, so the function calls into a
; table of 256 'sha1rnds4 xmm0, xmm1, imm8 + ret' stubs (6 bytes each) which
; is emitted right after the function body.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A2, 0xff                        ; Only the low 8 bits are the immediate; the upper register
                                                ; bits are not guaranteed to be zero and must not end up in
                                                ; the stub table index.
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]                 ; sizeof(insnX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]                 ; T1 = address of the stub for this immediate
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        sha1rnds4 xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                        ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_sha1rnds4_u128
6427
6428
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction takes XMM0 as an implicit third operand, so the constants
; at [A2] are loaded into xmm0 and xmm1/xmm2 are used for the explicit
; operands.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]                      ; implicit operand
        movdqu  xmm1, [A0]                      ; destination / first source
        movdqu  xmm2, [A1]                      ; second source
        sha256rnds2 xmm1, xmm2
        movdqu  [A0], xmm1                      ; write the result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
6451
6452
;
; 32-bit forms of ADCX and ADOX
;
; Loads the relevant flag from the EFLAGS value, executes the instruction
; with the source operand register as destination and [A0] as source, writes
; the sum back to [A0] and saves the resulting flag.
;
; @param    1       The instruction name (adcx or adox).
; @param    2       The EFLAGS bit the instruction reads and modifies.
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_3_ARGS                         ; three arguments: A0, A1 and A2

        IEM_LOAD_FLAGS A1, %2, 0
        %1      A2_32, [A0]                     ; A2_32 = A2_32 + [A0] + flag
        mov     [A0], A2_32
        IEM_SAVE_FLAGS A1, %2, 0

        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro

;
; 64-bit forms of ADCX and ADOX
;
; Same as the 32-bit forms, but operating on 64-bit operands.
;
; @param    1       The instruction name (adcx or adox).
; @param    2       The EFLAGS bit the instruction reads and modifies.
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_3_ARGS                         ; three arguments: A0, A1 and A2

        IEM_LOAD_FLAGS A1, %2, 0
        %1      A2, [A0]                        ; A2 = A2 + [A0] + flag
        mov     [A0], A2
        IEM_SAVE_FLAGS A1, %2, 0

        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette