VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 105354

Last change on this file since 105354 was 105354, checked in by vboxsync, 5 months ago

VMM/IEM: Implement vcvttpd2dq and vcvtpd2dq instruction emulations, some fixes for vcvtpd2ps, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 219.6 KB
Line 
1; $Id: IEMAllAImpl.asm 105354 2024-07-16 12:37:10Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2024 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
; This is handy for generating absolutely correct EFLAGS.
43;%define IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
44
45
;;
; RET XX / RET wrapper for fastcall.
;
; On 32-bit Windows the fastcall callee pops its stack arguments, so a
; 'ret imm16' with the argument byte count (%1) is required; every other
; target uses a plain 'ret'.
;
%macro RET_FASTCALL 1
 %ifdef RT_ARCH_X86
  %ifdef RT_OS_WINDOWS
        ret     %1                  ; x86 Windows fastcall: callee cleans up the stack.
  %endif
 %endif
 %ifndef RT_ARCH_X86
        ret                         ; AMD64: all args in registers, nothing to pop.
 %elifndef RT_OS_WINDOWS
        ret                         ; x86 non-Windows: caller cleans up.
 %endif
%endmacro
60
;;
; NAME for fastcall functions.
;
; Expands to the platform-mangled symbol name. By default this is plain
; NAME(a_Name) and the a_cbArgs / a_Prefix arguments are ignored; only
; 32-bit Windows fastcall needs the <prefix>Name@cbArgs decoration.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
; escaping (or whatever the dollar is good for here). Thus the ugly
; prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  ; x86 Windows fastcall decoration: <prefix>Name@<stack-arg-bytes>.
  %undef  NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
75
;;
; BEGINPROC for fastcall functions.
;
; Emits a hidden global function label with the fastcall-mangled name (see
; NAME_FASTCALL) followed by an ENDBR instruction when indirect-branch
; tracking (CET/IBT) is enabled, since these entry points may be reached
; via indirect calls.
;
; @param 1 The function name (C).
; @param 2 The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
        IBT_ENDBRxx
%endmacro
86
87
;
; We employ some macro assembly here to hide the calling convention differences.
;
%ifdef RT_ARCH_AMD64
 ;
 ; AMD64: up to four arguments are always passed in registers (both the
 ; System V and Microsoft conventions), so the prologues are empty and the
 ; epilogues are plain returns.
 ;
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0 ; NOTE(review): declared with 0 params here but 1 on x86 below - confirm call sites.
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1    ; %1 = stack bytes to pop on x86; ignored on AMD64.
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1    ; %1 = stack bytes to pop on x86; ignored on AMD64.
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1    ; %1 = stack bytes to pop on x86; ignored on AMD64.
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64 ABI argument registers (rdi, rsi, rdx, rcx).
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
  %define A3_8      cl
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64 ABI argument registers (rcx, rdx, r8, r9).
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
  %define A3_8      r9b
 %endif

 ; Temporary (scratch) registers - all volatile in both 64-bit conventions.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

 ;
 ; Return value, same as T0 but to make it more obvious
 ; that this is a return value.
 ;
 %define R0         rax
 %define R0_32      eax
 %define R0_16      ax
 %define R0_8       al

%else
 ; x86 (fastcall): A0/A1 live in ecx/edx; A2/A3 are fetched from the stack
 ; into ebx/esi by the prologues. edi (T1) and the other callee-saved
 ; registers used are pushed/popped around the function body.
 %macro PROLOGUE_1_ARGS 0
        push    edi                     ; save callee-saved edi (used as T1).
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1                      ; pop %1 bytes of stack arguments (fastcall).
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi                     ; save callee-saved edi (used as T1).
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx                     ; save callee-saved ebx (used as A2).
        mov     ebx, [esp + 4 + 4]      ; A2 = 1st stack argument (after retaddr + push).
        push    edi                     ; save callee-saved edi (used as T1).
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx                     ; save callee-saved ebx (used as A2).
        push    edi                     ; save callee-saved edi (used as T1).
        push    esi                     ; save callee-saved esi (used as A3).
        mov     ebx, [esp + 12 + 4 + 0] ; A2 = 1st stack argument (after retaddr + 3 pushes).
        mov     esi, [esp + 12 + 4 + 4] ; A3 = 2nd stack argument.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 ; Fastcall argument registers (A2/A3 loaded from the stack by the prologues).
 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi                 ; no 8-bit view of esi on x86.
 %define A3_32      esi
 %define A3_16      si

 ; Temporary (scratch) registers.
 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi                 ; no 8-bit view of edi on x86; no T2 here.
 %define T1_32      edi
 %define T1_16      di
%endif
287
288
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Loads the guest EFLAGS value into the host EFLAGS register so the
; instruction emulated next starts from the right flag state. Skipped
; entirely when neither undefined (%3) nor must-load (%4) flags exist.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
; @param 4 The flags that must be loaded.
;
%macro IEM_MAYBE_LOAD_FLAGS 4
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        ; Exact variant: merge all guest status flags into the host EFLAGS via the stack.
        pushf                                                   ; store current flags
        mov     T0_32, %1                                       ; load the guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; select the modified and undefined flags.
        or      [xSP], T0                                       ; merge guest flags with host flags.
        popf                                                    ; load the mixed flags.

 %elif (%3 + %4) != 0
  %if 1 ; This approach seems faster on intel 10980XE
   %if (%3 | %4) == X86_EFL_CF
        ; Use bt to load bit into CF
        bt      %1, X86_EFL_CF_BIT
   %else
        ; Use ADD to set OF and SAHF for the rest. ASSUMES T0_32 is eax!
        mov     eax, %1
    %if (%3 | %4) == X86_EFL_OF
        ; Use ADD to set OF.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
    %elif ((%3 | %4) & X86_EFL_OF) != 0
        ; Use ADD to set OF.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; Use SAHF to set the other status flags.
        sahf
    %else ; OF not needed; so al -> ah and load ah into eflags.
     %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
     %else
        xchg    al, ah
     %endif
        sahf
    %endif
   %endif

  %else
        ; Generic fallback: merge the selected flags via the stack.
        pushf                                                   ; store current flags
        mov     T0_32, %1                                       ; load the guest flags
        and     dword [xSP], ~(%2 | %3)                         ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)                                ; select the modified and undefined flags.
        or      [xSP], T0                                       ; merge guest flags with host flags.
        popf                                                    ; load the mixed flags.
  %endif
 %endif
%endmacro
346
;;
; Load the relevant flags from [%1].
;
; Unconditional variant of IEM_MAYBE_LOAD_FLAGS: always loads the guest
; flag state into the host EFLAGS register.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The set of flags to load.
; @param 3 The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        ; Exact variant: merge all guest status flags into the host EFLAGS via the stack.
        pushf                                                   ; store current flags
        mov     T0_32, %1                                       ; load the guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; mask out the modified, undefined and status flags
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; select the modified, undefined and status flags.
        or      [xSP], T0                                       ; merge guest flags with host flags.
        popf                                                    ; load the mixed flags.

 %elif 1 ; This approach seems faster on intel 10980XE
  %if (%3 | %2) == X86_EFL_CF
        ; Use bt to load bit into CF
        bt      %1, X86_EFL_CF_BIT
  %else
        mov     eax, %1                 ; ASSUMES T0_32 is eax!!
   %if (%3 | %2) == X86_EFL_OF
        ; Use ADD to set OF.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
   %elif ((%3 | %2) & X86_EFL_OF) != 0
        ; Use ADD to set OF.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; Use SAHF to set the other status flags.
        sahf
   %else ; OF not needed; so al -> ah and load ah into eflags.
    %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
    %else
        xchg    al, ah
    %endif
        sahf
   %endif
  %endif ; (%3 | %2) != X86_EFL_CF

 %else
        ; Generic fallback: merge the selected flags via the stack.
        pushf                                                   ; store current flags
        mov     T0_32, %1                                       ; load the guest flags
        and     dword [xSP], ~(%2 | %3)                         ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)                                ; select the modified and undefined flags.
        or      [xSP], T0                                       ; merge guest flags with host flags.
        popf                                                    ; load the mixed flags.
 %endif
%endmacro
400
;;
; Merge incoming guest EFLAGS (%1) with host EFLAGS into EAX (T0).
;
; Captures the host status flags produced by the just-executed instruction,
; merges them with the unmodified guest flag bits, and leaves the combined
; value in EAX as the function return value.
;
; @remarks Clobbers T0, T1, %1, stack.
; @param 1 The parameter (A0..A3) holding the OLD eflags value. Clobbered.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
;
%macro IEM_SAVE_FLAGS_RETVAL 4 0
 %if (%2 | %3 | %4) != 0
        mov     T1_32, %1               ; flags ; NOTE(review): T1 appears unread below - vestigial? confirm.
  %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf
        pop     T0
        and     %1, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)    ; select the modified, undefined and status flags.
  %else
   %if (%2 | %3 | %4) == X86_EFL_CF
        setc    T0_8                    ; only CF wanted - SETcc avoids pushf.
   %elif (%2 | %3) == X86_EFL_OF
        seto    T0_8
        shl     T0_32, X86_EFL_OF_BIT
   %elif (%2 | %3) == X86_EFL_ZF
        setz    T0_8                    ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
        shl     T0_32, X86_EFL_ZF_BIT
   %elif (%2 | %3) <= 0xff
        lahf                            ; all wanted flags fit in AH (no OF needed).
        movzx   eax, ah                 ; ASSUMES T0_32 is eax!
   %elif 1 ; The locked functions are generally faster on 10980XE with this approach
        lahf                            ; while there seems only to be a tiny advantage in most other test.
        movzx   eax, ah                 ; ASSUMES T0_32 is eax!
        jno     .of_is_clear
        or      eax, X86_EFL_OF
.of_is_clear:
   %else
        pushf                           ; this is a bit slow
        pop     T0
   %endif
        and     %1, ~(%2 | %3 | %4)     ; clear the modified & undefined & zeroed flags.
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
  %endif
        or      T0_32, %1               ; combine the flags. ASSUMES T0 = eax!
        ;mov     %1, T0_32              ; save the flags.
 %endif
%endmacro
447
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; Result is written back into %1 (unlike IEM_SAVE_FLAGS_RETVAL which
; returns it in EAX). No-op when all three masks are zero.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
;; @todo make it stuff the result into EAX?
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host flags after the emulated instruction.
        mov     T0_32, %1               ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     %1, T0_32               ; save the result.
 %endif
%endmacro
472
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6), storing the result into EAX (T0).
;
; SF is derived from the sign bit of %4 and PF from the g_afParity lookup
; table indexed by %6, rather than from the host EFLAGS.
;
; @note %4 & %6 must not be RAX, EAX, or AX! So, don't use with full MUL/IMUL.
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS, %1.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index. Will be modified!
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL 6
        pushf
        pop     T0                      ; T0 = host flags after the emulated instruction.
        and     %1, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
        and     T0_32, (%2)             ; select the modified flags.
        or      T0_32, %1               ; combine the flags.

        ; First calculate SF as it is the same register as %6 (only %6 is always full width).
        bt      %4, %5 - 1              ; CF = sign bit of the %5-bit result.
        jnc     %%sf_clear
        or      T0_32, X86_EFL_SF
%%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; table is indexed by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T1, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T1 + %6]
 %else
        or      T0_8, [NAME(g_afParity) + %6]
 %endif

        ;mov     %1, T0_32              ; save the result.
        ; ASSUMES T0 = eax!
%endmacro
511
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Pure mask adjustment of the guest flags in %1; does not look at the host
; EFLAGS at all. No-op when both masks are zero.
;
; @remarks Clobbers T0.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, %1               ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     %1, T0_32               ; Save the result.
 %endif
%endmacro
532
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Like IEM_ADJUST_FLAGS, but additionally computes PF from the g_afParity
; lookup table indexed by the low byte of %4.
;
; @remarks Clobbers T0, %4, EFLAGS; also T2 on AMD64 (table address).
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, %1               ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; table is indexed by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     %1, T0_32               ; Save the result.
%endmacro
557
558
559;;;; OLD EFLAGS macros.
560;;;; OLD EFLAGS macros.
561;;;; OLD EFLAGS macros.
562;;;; OLD EFLAGS macros.
563;;;; OLD EFLAGS macros.
564
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Legacy variant of IEM_MAYBE_LOAD_FLAGS: %1 is a POINTER to the eflags
; rather than the eflags value itself.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
; @param 4 The flags that must be loaded.
;
%macro IEM_MAYBE_LOAD_FLAGS_OLD 4
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        ; Exact variant: merge all guest status flags into the host EFLAGS via the stack.
        pushf                                                   ; store current flags
        mov     T0_32, [%1]                                     ; load the guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; select the modified and undefined flags.
        or      [xSP], T0                                       ; merge guest flags with host flags.
        popf                                                    ; load the mixed flags.

 %elif (%3 + %4) != 0
  %if 1 ; This approach seems faster on intel 10980XE
   %if (%3 | %4) == X86_EFL_CF
        ; Use bt to load bit into CF
        bt      dword [%1], X86_EFL_CF_BIT
   %else
        ; Use ADD to set OF and SAHF for the rest. ASSUMES T0_32 is eax!
        mov     eax, [%1]
    %if (%3 | %4) == X86_EFL_OF
        ; Use ADD to set OF.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
    %elif ((%3 | %4) & X86_EFL_OF) != 0
        ; Use ADD to set OF.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; Use SAHF to set the other status flags.
        sahf
    %else ; OF not needed; so al -> ah and load ah into eflags.
     %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
     %else
        xchg    al, ah
     %endif
        sahf
    %endif
   %endif

  %else
        ; Generic fallback: merge the selected flags via the stack.
        pushf                                                   ; store current flags
        mov     T0_32, [%1]                                     ; load the guest flags
        and     dword [xSP], ~(%2 | %3)                         ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)                                ; select the modified and undefined flags.
        or      [xSP], T0                                       ; merge guest flags with host flags.
        popf                                                    ; load the mixed flags.
  %endif
 %endif
%endmacro
622
;;
; Load the relevant flags from [%1].
;
; Legacy variant of IEM_LOAD_FLAGS: %1 is a POINTER to the eflags rather
; than the eflags value itself.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of flags to load.
; @param 3 The set of undefined flags.
;
%macro IEM_LOAD_FLAGS_OLD 3
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        ; Exact variant: merge all guest status flags into the host EFLAGS via the stack.
        pushf                                                   ; store current flags
        mov     T0_32, [%1]                                     ; load the guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; mask out the modified, undefined and status flags
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; select the modified, undefined and status flags.
        or      [xSP], T0                                       ; merge guest flags with host flags.
        popf                                                    ; load the mixed flags.

 %elif 1 ; This approach seems faster on intel 10980XE
  %if (%3 | %2) == X86_EFL_CF
        ; Use bt to load bit into CF
        bt      dword [%1], X86_EFL_CF_BIT
  %else
        mov     eax, [%1]               ; ASSUMES T0_32 is eax!!
   %if (%3 | %2) == X86_EFL_OF
        ; Use ADD to set OF.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
   %elif ((%3 | %2) & X86_EFL_OF) != 0
        ; Use ADD to set OF.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; Use SAHF to set the other status flags.
        sahf
   %else ; OF not needed; so al -> ah and load ah into eflags.
    %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
    %else
        xchg    al, ah
    %endif
        sahf
   %endif
  %endif ; (%3 | %2) != X86_EFL_CF

 %else
        ; Generic fallback: merge the selected flags via the stack.
        pushf                                                   ; store current flags
        mov     T0_32, [%1]                                     ; load the guest flags
        and     dword [xSP], ~(%2 | %3)                         ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)                                ; select the modified and undefined flags.
        or      [xSP], T0                                       ; merge guest flags with host flags.
        popf                                                    ; load the mixed flags.
 %endif
%endmacro
676
;;
; Update the flag.
;
; Legacy variant of IEM_SAVE_FLAGS_RETVAL: %1 is a POINTER to the eflags,
; and the merged result is stored back through it instead of being
; returned in EAX.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
;
%macro IEM_SAVE_FLAGS_OLD 4 0
 %if (%2 | %3 | %4) != 0
        mov     T1_32, [%1]             ; flags
  %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf
        pop     T0
        and     T1_32, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)       ; select the modified, undefined and status flags.
  %else
   %if (%2 | %3 | %4) == X86_EFL_CF
        setc    T0_8                    ; only CF wanted - SETcc avoids pushf.
   %elif (%2 | %3) == X86_EFL_OF
        seto    T0_8
        shl     T0_32, X86_EFL_OF_BIT
   %elif (%2 | %3) == X86_EFL_ZF
        setz    T0_8                    ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
        shl     T0_32, X86_EFL_ZF_BIT
   %elif (%2 | %3) <= 0xff
        lahf                            ; all wanted flags fit in AH (no OF needed).
        movzx   eax, ah                 ; ASSUMES T0_32 is eax!
   %elif 1 ; The locked functions are generally faster on 10980XE with this approach
        lahf                            ; while there seems only to be a tiny advantage in most other test.
        movzx   eax, ah                 ; ASSUMES T0_32 is eax!
        jno     .of_is_clear
        or      eax, X86_EFL_OF
.of_is_clear:
   %else
        pushf                           ; this is a bit slow
        pop     T0
   %endif
        and     T1_32, ~(%2 | %3 | %4)  ; clear the modified & undefined & zeroed flags.
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
  %endif
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
723
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; Legacy variant of IEM_SAVE_AND_ADJUST_FLAGS: %1 is a POINTER to the
; EFLAGS, read and written through.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS_OLD 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host flags after the emulated instruction.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
747
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS; also T2 on AMD64.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index. Will be modified!
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                      ; T2 = host flags (T0/xAX holds the MUL/IMUL result).
 %else
        push    T0                      ; no T2 on x86 - preserve T0/xAX around the flag grab.
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; restore the MUL/IMUL result.
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the %5-bit result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
%%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; table is indexed by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
801
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Legacy variant of IEM_ADJUST_FLAGS: %1 is a POINTER to the EFLAGS,
; read and written through. No-op when both masks are zero.
;
; @remarks Clobbers T0.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS_OLD 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
822
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Legacy variant of IEM_ADJUST_FLAGS_WITH_PARITY: %1 is a POINTER to the
; EFLAGS. PF is computed from the g_afParity table indexed by the low
; byte of %4.
;
; @remarks Clobbers T0, %4, EFLAGS; also T2 on AMD64 (table address).
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY_OLD 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; table is indexed by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
847
848
849
;;
; Loads register with offset of imm8 instruction -- used by all of the instruction
; implementations which lay out jump tables of 256x immediate byte variants.
; Also checks that the instruction size matches the offsets in the table.
;
; The multiply-by-%3 is open-coded with LEA chains since LEA scales are
; limited to 1/2/4/8; the trailing TIMES lines emit an out-of-range byte
; (assembler warning) if the table stride or total size is wrong.
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
; @note Implicitly uses local symbols .imm0, .imm1, and .immEnd
; (implementation artifacts of each instruction jump table).
;
; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]`.
;
%macro IEMIMPL_JUMP_TABLE_TARGET_INT 3
        lea     %1, [.imm0 xWrtRIP]     ; %1 = table base.
 %if %3 == 5
        lea     T0, [%2 + %2*4]         ; *5
        lea     %1, [%1 + T0]           ; *5 + .imm0
 %elif %3 == 6
        lea     T0, [%2 + %2*2]         ; *3
        lea     %1, [%1 + T0*2]         ; *6 + .imm0
 %elif %3 == 7
        lea     T0, [%2 + %2*2]         ; *3
        lea     T0, [T0 + %2*4]         ; *7
        lea     %1, [%1 + T0]           ; *7 + .imm0
 %elif %3 == 8
        lea     %1, [%1 + %2*8]         ; *8 + .imm0
 %elif %3 == 9
        lea     T0, [%2 + %2*8]         ; *9
        lea     %1, [%1 + T0]           ; *9 + .imm0
 %elif %3 == 10
        lea     T0, [%2 + %2*4]         ; *5
        lea     %1, [%1 + T0*2]         ; *10 + .imm0
 %elif %3 == 11
        lea     T0, [%2 + %2*4]         ; *5
        lea     T0, [%2 + T0*2]         ; *11
        lea     %1, [%1 + T0]           ; *11 + .imm0
 %elif %3 == 12
        lea     T0, [%2 + %2*2]         ; *3
        lea     %1, [%1 + T0*4]         ; *12 + .imm0
 %else
  %error Unexpected instruction byte count in IEMIMPL_JUMP_TABLE_TARGET_INT
 %endif
        ; check size: 'warning: value does not fit in 8 bit field' if bad
        times (.imm1 - .imm0 + %3) %% %3 db 999 * \
              (.imm1 - .imm0 + %3)
        ; check alignment: 'warning: value does not fit in 8 bit field' if bad
        times ((.immEnd - .imm0) - 256 * %3) db 999 * \
              ((.immEnd - .imm0) - 256 * %3)
%endmacro
900
;;
; Wrapper around IEMIMPL_JUMP_TABLE_TARGET_INT that accounts for the extra
; 4-byte ENDBR instruction per table entry when IBT branch protection is
; built without NOTRACK support.
;
%macro IEMIMPL_JUMP_TABLE_TARGET 3
 %ifndef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, %3
 %else
        IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, (%3 + 4) ; each entry carries an ENDBR (4 bytes).
 %endif
%endmacro
908
909
;;
; Calls the given imm8 instruction -- used by all of the instruction
; implementations which lay out jump tables of 256x immediate byte variants.
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
;
; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]` +
; `IBT_NOTRACK, call %1`.
;
%macro IEMIMPL_CALL_JUMP_TABLE_TARGET 3
        IEMIMPL_JUMP_TABLE_TARGET %1, %2, %3
        IBT_NOTRACK                     ; indirect call into the table; suppress IBT tracking.
        call    %1
%endmacro
926
927
928;*********************************************************************************************************************************
929;* External Symbols *
930;*********************************************************************************************************************************
931extern NAME(g_afParity)
932
933
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take the input EFLAGS value in A0, a pointer to the
; destination memory operand in A1, and the source register operand in A2.
; The updated EFLAGS value is returned in EAX (via IEM_SAVE_FLAGS_RETVAL).
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
; @param 5 The flags that must be loaded (ADC, SBB).
; @param 6 The flags that will be zeroed by the operation.
;
%macro IEMIMPL_BIN_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
        %1      byte [A1], A2_8         ; perform the operation on the 8-bit destination.
        IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
        %1      word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
        %1      dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64 ; 64-bit operand size needs hand coding on 32-bit hosts.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
        %1      qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
        lock %1 byte [A1], A2_8         ; atomic variant for LOCK-prefixed guest code.
        IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
        lock %1 word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
        lock %1 dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
        lock %1 qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
1024
; Instantiate the binary operator workers.
; instr,lock, modified-flags, undefined flags, must be loaded, zeroed flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,          0,          0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,          X86_EFL_CF, 0 ; adc consumes CF.
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,          0,          0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,          X86_EFL_CF, 0 ; sbb consumes CF.
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,          0,          0 ; no locked form (no write-back).
IEMIMPL_BIN_OP or,   1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                        X86_EFL_AF, 0,          X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                        X86_EFL_AF, 0,          X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP and,  1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                        X86_EFL_AF, 0,          X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP test, 0, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                        X86_EFL_AF, 0,          X86_EFL_OF | X86_EFL_CF ; no locked form (no write-back).
1035
1036
1037;;
1038; Macro for implementing a binary operator, VEX variant with separate input/output.
1039;
1040; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1041; where the 64-bit accesses requires hand coding.
1042;
1043; All the functions takes a pointer to the destination memory operand in A0,
1044; the first source register operand in A1, the second source register operand
1045; in A2 and a pointer to eflags in A3.
1046;
1047; @param 1 The instruction mnemonic.
1048; @param 2 The modified flags.
1049; @param 3 The undefined flags.
1050; @param 4 The zeroed flags.
1051;
1052%macro IEMIMPL_VEX_BIN_OP 4
1053BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1054 PROLOGUE_4_ARGS
1055 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0 ;; @todo do we need to load undefined flags for any platform?
1056 %1 T0_32, A1_32, A2_32
1057 mov [A0], T0_32
1058 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
1059 EPILOGUE_4_ARGS
1060ENDPROC iemAImpl_ %+ %1 %+ _u32
1061
1062 %ifdef RT_ARCH_AMD64
1063BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1064 PROLOGUE_4_ARGS
1065 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0
1066 %1 T0, A1, A2
1067 mov [A0], T0
1068 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
1069 EPILOGUE_4_ARGS
1070ENDPROC iemAImpl_ %+ %1 %+ _u64
1071 %endif ; RT_ARCH_AMD64
1072%endmacro
1073
; instr, modified-flags, undefined-flags, zeroed-flags
IEMIMPL_VEX_BIN_OP andn,  X86_EFL_SF | X86_EFL_ZF,                X86_EFL_AF | X86_EFL_PF,              X86_EFL_OF | X86_EFL_CF
IEMIMPL_VEX_BIN_OP bextr, X86_EFL_ZF,                             X86_EFL_SF | X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
IEMIMPL_VEX_BIN_OP bzhi,  X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF,   X86_EFL_AF | X86_EFL_PF,              X86_EFL_OF
1078
1079;;
1080; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
1081;
1082; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1083; where the 64-bit accesses requires hand coding.
1084;
1085; All the functions takes a pointer to the destination memory operand in A1,
1086; the source register operand in A2 and incoming EFLAGS in A0. Updated EFLAGS
1087; are returned in EAX.
1088;
1089; @param 1 The instruction mnemonic.
1090; @param 2 The modified flags.
1091; @param 3 The undefined flags.
1092; @param 4 The zeroed flags.
1093;
1094%macro IEMIMPL_VEX_BIN_OP_2 4
1095BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1096 PROLOGUE_4_ARGS
1097 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0 ;; @todo check if any undefined flags are passed thru
1098 mov T0_32, [A1]
1099 %1 T0_32, A2_32
1100 mov [A1], T0_32
1101 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1102 EPILOGUE_4_ARGS
1103ENDPROC iemAImpl_ %+ %1 %+ _u32
1104
1105 %ifdef RT_ARCH_AMD64
1106BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1107 PROLOGUE_4_ARGS
1108 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1109 mov T0, [A1]
1110 %1 T0, A2
1111 mov [A1], T0
1112 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1113 EPILOGUE_4_ARGS
1114ENDPROC iemAImpl_ %+ %1 %+ _u64
1115 %endif ; RT_ARCH_AMD64
1116%endmacro
1117
; instr, modified-flags, undefined-flags, zeroed-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1122
1123
1124;;
1125; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
1126;
1127; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1128; where the 64-bit accesses requires hand coding.
1129;
1130; All the functions takes a pointer to the destination memory operand in A0,
1131; the first source register operand in A1, the second source register operand
1132; in A2 and a pointer to eflags in A3.
1133;
1134; @param 1 The instruction mnemonic.
1135; @param 2 Fallback instruction if applicable.
1136; @param 3 Whether to emit fallback or not.
1137;
1138%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
1139BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1140 PROLOGUE_3_ARGS
1141 %1 T0_32, A1_32, A2_32
1142 mov [A0], T0_32
1143 EPILOGUE_3_ARGS
1144ENDPROC iemAImpl_ %+ %1 %+ _u32
1145
1146 %if %3
1147BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
1148 PROLOGUE_3_ARGS
1149 %ifdef ASM_CALL64_GCC
1150 mov cl, A2_8
1151 %2 A1_32, cl
1152 mov [A0], A1_32
1153 %else
1154 xchg A2, A0
1155 %2 A1_32, cl
1156 mov [A2], A1_32
1157 %endif
1158 EPILOGUE_3_ARGS
1159ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
1160 %endif
1161
1162 %ifdef RT_ARCH_AMD64
1163BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1164 PROLOGUE_3_ARGS
1165 %1 T0, A1, A2
1166 mov [A0], T0
1167 EPILOGUE_3_ARGS
1168ENDPROC iemAImpl_ %+ %1 %+ _u64
1169
1170 %if %3
1171BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
1172 PROLOGUE_3_ARGS
1173 %ifdef ASM_CALL64_GCC
1174 mov cl, A2_8
1175 %2 A1, cl
1176 mov [A0], A1_32
1177 %else
1178 xchg A2, A0
1179 %2 A1, cl
1180 mov [A2], A1_32
1181 %endif
1182 mov [A0], A1
1183 EPILOGUE_3_ARGS
1184ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
1185 %endif
1186 %endif ; RT_ARCH_AMD64
1187%endmacro
1188
; instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0  ; no single-instruction fallback; handled in C
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0  ; no single-instruction fallback; handled in C
1195
1196
1197;
1198; RORX uses a immediate byte for the shift count, so we only do
1199; fallback implementation of that one.
1200;
1201BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
1202 PROLOGUE_3_ARGS
1203 %ifdef ASM_CALL64_GCC
1204 mov cl, A2_8
1205 ror A1_32, cl
1206 mov [A0], A1_32
1207 %else
1208 xchg A2, A0
1209 ror A1_32, cl
1210 mov [A2], A1_32
1211 %endif
1212 EPILOGUE_3_ARGS
1213ENDPROC iemAImpl_rorx_u32
1214
 %ifdef RT_ARCH_AMD64
; 64-bit RORX fallback, same scheme as the 32-bit one above.
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; rotate count must be in CL for ror
        ror     A1, cl
        mov     [A0], A1
 %else
        xchg    A2, A0                  ; A0 is RCX on MSC: count lands in CL, dst ptr in A2.
        ror     A1, cl
        mov     [A2], A1                ; A2 holds the destination pointer after the xchg.
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
1230
1231
1232;
1233; MULX
1234;
1235BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
1236 PROLOGUE_4_ARGS
1237%ifdef ASM_CALL64_GCC
1238 ; A2_32 is EDX - prefect
1239 mulx T0_32, T1_32, A3_32
1240 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
1241 mov [A0], T0_32
1242%else
1243 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
1244 xchg A1, A2
1245 mulx T0_32, T1_32, A3_32
1246 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
1247 mov [A0], T0_32
1248%endif
1249 EPILOGUE_4_ARGS
1250ENDPROC iemAImpl_mulx_u32
1251
1252
; MULX fallback using plain MUL (EDX:EAX = EAX * src); for hosts without BMI2.
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
1271
1272%ifdef RT_ARCH_AMD64
; 64-bit MULX, same register shuffling scheme as the 32-bit variant above.
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64
1289
1290
; 64-bit MULX fallback using plain MUL (RDX:RAX = RAX * src); for hosts without BMI2.
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback
1309
1310%endif
1311
1312
1313;;
1314; Macro for implementing a bit operator.
1315;
1316; This will generate code for the 16, 32 and 64 bit accesses with locked
1317; variants, except on 32-bit system where the 64-bit accesses requires hand
1318; coding.
1319;
1320; All the functions takes a pointer to the destination memory operand in A1,
1321; the source register operand in A2 and incoming eflags in A0.
1322;
1323; @param 1 The instruction mnemonic.
1324; @param 2 Non-zero if there should be a locked version.
1325; @param 3 The modified flags.
1326; @param 4 The undefined flags.
1327;
1328%macro IEMIMPL_BIT_OP 4
1329BEGINCODE
1330BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1331 PROLOGUE_3_ARGS
1332 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1333 %1 word [A1], A2_16
1334 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1335 EPILOGUE_3_ARGS
1336ENDPROC iemAImpl_ %+ %1 %+ _u16
1337
1338BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1339 PROLOGUE_3_ARGS
1340 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1341 %1 dword [A1], A2_32
1342 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1343 EPILOGUE_3_ARGS
1344ENDPROC iemAImpl_ %+ %1 %+ _u32
1345
1346 %ifdef RT_ARCH_AMD64
1347BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1348 PROLOGUE_3_ARGS
1349 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1350 %1 qword [A1], A2
1351 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1352 EPILOGUE_3_ARGS_EX 8
1353ENDPROC iemAImpl_ %+ %1 %+ _u64
1354 %endif ; RT_ARCH_AMD64
1355
1356 %if %2 != 0 ; locked versions requested?
1357
1358BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
1359 PROLOGUE_3_ARGS
1360 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1361 lock %1 word [A1], A2_16
1362 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1363 EPILOGUE_3_ARGS
1364ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1365
1366BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
1367 PROLOGUE_3_ARGS
1368 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1369 lock %1 dword [A1], A2_32
1370 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1371 EPILOGUE_3_ARGS
1372ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1373
1374 %ifdef RT_ARCH_AMD64
1375BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
1376 PROLOGUE_3_ARGS
1377 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1378 lock %1 qword [A1], A2
1379 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1380 EPILOGUE_3_ARGS_EX 8
1381ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1382 %endif ; RT_ARCH_AMD64
1383 %endif ; locked
1384%endmacro
1385
; Undefined flags are passed thru here by the intel and amd CPUs we have.
;              instr, lock, modified efl, undefined eflags
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1392
1393;;
1394; Macro for implementing a bit search operator.
1395;
1396; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1397; system where the 64-bit accesses requires hand coding.
1398;
1399; All the functions takes a pointer to the destination memory operand in A1,
1400; the source register operand in A2 and the incoming eflags in A0.
1401;
1402; In the ZF case the destination register is 'undefined', however it seems that
1403; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
1404; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
1405; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
1406; the behaviour of more recent CPUs (Intel 10980XE and AMD 3990X).
1407;
1408; Intel: Clear all and calculate PF in addition to ZF.
1409; AMD: Passthru all flags other than ZF.
1410;
1411; @param 1 The instruction mnemonic.
1412; @param 2 The modified flags.
1413; @param 3 The undefined flags.
1414; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
1415;
1416%macro IEMIMPL_BIT_OP2 4
1417BEGINCODE
1418; 16-bit
1419
1420BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1421 PROLOGUE_3_ARGS
1422 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1423 %1 T0_16, A2_16
1424%if %4 != 0
1425 jz .unchanged_dst
1426%endif
1427 mov [A1], T0_16
1428.unchanged_dst:
1429 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1430 EPILOGUE_3_ARGS
1431ENDPROC iemAImpl_ %+ %1 %+ _u16
1432
1433;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
1434;bad; PROLOGUE_3_ARGS
1435;bad; %1 T1_16, A1_16
1436;bad; jz .unchanged_dst
1437;bad; mov [A0], T1_16
1438;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1439;bad; EPILOGUE_3_ARGS
1440;bad;.unchanged_dst:
1441;bad;%if %4 != 0
1442;bad; mov [A0], T1_16
1443;bad;%endif
1444;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1445;bad; EPILOGUE_3_ARGS
1446;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
1447;bad;
1448;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
1449;bad; PROLOGUE_3_ARGS
1450;bad; %1 T0_16, A1_16
1451;bad;%if %4 != 0
1452;bad; jz .unchanged_dst
1453;bad;%endif
1454;bad; mov [A0], T0_16
1455;bad;.unchanged_dst:
1456;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1457;bad; EPILOGUE_3_ARGS
1458;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1459
1460; 32-bit
1461
1462BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1463 PROLOGUE_3_ARGS
1464 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1465 %1 T0_32, A2_32
1466%if %4 != 0
1467 jz .unchanged_dst
1468%endif
1469 mov [A1], T0_32
1470.unchanged_dst:
1471 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1472 EPILOGUE_3_ARGS
1473ENDPROC iemAImpl_ %+ %1 %+ _u32
1474
1475;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1476;bad; PROLOGUE_3_ARGS
1477;bad; %1 T1_32, A1_32
1478;bad;%if %4 != 0
1479;bad; jz .unchanged_dst
1480;bad;%endif
1481;bad; mov [A0], T1_32
1482;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1483;bad; EPILOGUE_3_ARGS
1484;bad;.unchanged_dst:
1485;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1486;bad; EPILOGUE_3_ARGS
1487;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1488;bad;
1489;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1490;bad; PROLOGUE_3_ARGS
1491;bad; %1 T0_32, A1_32
1492;bad;%if %4 != 0
1493;bad; jz .unchanged_dst
1494;bad;%endif
1495;bad; mov [A0], T0_32
1496;bad;.unchanged_dst:
1497;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1498;bad; EPILOGUE_3_ARGS
1499;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1500
1501
1502 %ifdef RT_ARCH_AMD64
1503; 64-bit
1504
1505BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1506 PROLOGUE_3_ARGS
1507 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1508 %1 T0, A2
1509%if %4 != 0
1510 jz .unchanged_dst
1511%endif
1512 mov [A1], T0
1513.unchanged_dst:
1514 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1515 EPILOGUE_3_ARGS_EX 8
1516ENDPROC iemAImpl_ %+ %1 %+ _u64
1517
1518;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1519;bad; PROLOGUE_3_ARGS
1520;bad; %1 T1, A1
1521;bad;%if %4 != 0
1522;bad; jz .unchanged_dst
1523;bad;%endif
1524;bad; mov [A0], T1
1525;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1526;bad; EPILOGUE_3_ARGS
1527;bad;.unchanged_dst:
1528;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1529;bad; EPILOGUE_3_ARGS
1530;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1531;bad;
1532;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1533;bad; PROLOGUE_3_ARGS
1534;bad; %1 T0, A1
1535;bad;%if %4 != 0
1536;bad; jz .unchanged_dst
1537;bad;%endif
1538;bad; mov [A0], T0
1539;bad;.unchanged_dst:
1540;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1541;bad; EPILOGUE_3_ARGS_EX 8
1542;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1543
1544 %endif ; RT_ARCH_AMD64
1545%endmacro
1546
;               instr, modified flags,              undefined flags,                                            dst-unwritten-on-ZF
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF),                (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF),                (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF),   (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF),              0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF),   (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF),              0
1551
1552
1553;;
1554; Macro for implementing POPCNT.
1555;
1556; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1557; system where the 64-bit accesses requires hand coding.
1558;
1559; All the functions takes a pointer to the destination memory operand in A1,
1560; the source register operand in A2 and eflags in A0.
1561;
1562; ASSUMES Intel and AMD set EFLAGS the same way.
1563;
1564; ASSUMES the instruction does not support memory destination.
1565;
1566; @param 1 The instruction mnemonic.
1567; @param 2 The modified flags.
1568; @param 3 The undefined flags.
1569; @param 4 The zeroed flags.
1570;
1571%macro IEMIMPL_BIT_OP3 4
1572BEGINCODE
1573BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1574 PROLOGUE_3_ARGS
1575 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1576 %1 T0_16, A2_16
1577 mov [A1], T0_16
1578 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1579 EPILOGUE_3_ARGS
1580ENDPROC iemAImpl_ %+ %1 %+ _u16
1581
1582BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1583 PROLOGUE_3_ARGS
1584 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1585 %1 T0_32, A2_32
1586 mov [A1], T0_32
1587 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1588 EPILOGUE_3_ARGS
1589ENDPROC iemAImpl_ %+ %1 %+ _u32
1590
1591 %ifdef RT_ARCH_AMD64
1592BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1593 PROLOGUE_3_ARGS
1594 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1595 %1 T0, A2
1596 mov [A1], T0
1597 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1598 EPILOGUE_3_ARGS_EX 8
1599ENDPROC iemAImpl_ %+ %1 %+ _u64
1600 %endif ; RT_ARCH_AMD64
1601%endmacro
1602IEMIMPL_BIT_OP3 popcnt, X86_EFL_ZF, 0, X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF
1603
1604
1605;
1606; IMUL is also a similar but yet different case (no lock, no mem dst).
1607; The rDX:rAX variant of imul is handled together with mul further down.
1608;
1609BEGINCODE
1610; @param 1 EFLAGS that are modified.
1611; @param 2 Undefined EFLAGS.
1612; @param 3 Function suffix.
1613; @param 4 EFLAGS variation: 0 for native, 1 for intel,
1614; 2 for AMD (set AF, clear PF, ZF and SF).
1615%macro IEMIMPL_IMUL_TWO 4
1616BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1617 PROLOGUE_3_ARGS
1618 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1619 imul A2_16, word [A1]
1620 mov [A1], A2_16
1621 %if %4 != 1
1622 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1623 %else
1624 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_16, 16, A2 ; intel
1625 %endif
1626 EPILOGUE_3_ARGS
1627ENDPROC iemAImpl_imul_two_u16 %+ %3
1628
1629BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1630 PROLOGUE_3_ARGS
1631 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1632 imul A2_32, dword [A1]
1633 mov [A1], A2_32
1634 %if %4 != 1
1635 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1636 %else
1637 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_32, 32, A2 ; intel
1638 %endif
1639 EPILOGUE_3_ARGS
1640ENDPROC iemAImpl_imul_two_u32 %+ %3
1641
1642 %ifdef RT_ARCH_AMD64
1643BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1644 PROLOGUE_3_ARGS
1645 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1646 imul A2, qword [A1]
1647 mov [A1], A2
1648 %if %4 != 1
1649 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1650 %else
1651 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2, 64, A2 ; intel
1652 %endif
1653 EPILOGUE_3_ARGS_EX 8
1654ENDPROC iemAImpl_imul_two_u64 %+ %3
1655 %endif ; RT_ARCH_AMD64
1656%endmacro
; The SF, ZF, AF and PF flags are "undefined".  AMD (3990x) leaves these
; flags as is.  Whereas Intel skylake (6700K and 10980XE (Cascade Lake)) always
; clear AF and ZF and calculates SF and PF as per the lower half of the result.
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1663
1664
1665;
1666; XCHG for memory operands. This implies locking. No flag changes.
1667;
1668; Each function takes two arguments, first the pointer to the memory,
1669; then the pointer to the register. They all return void.
1670;
1671BEGINCODE
1672BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1673 PROLOGUE_2_ARGS
1674 mov T0_8, [A1]
1675 xchg [A0], T0_8
1676 mov [A1], T0_8
1677 EPILOGUE_2_ARGS
1678ENDPROC iemAImpl_xchg_u8_locked
1679
1680BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1681 PROLOGUE_2_ARGS
1682 mov T0_16, [A1]
1683 xchg [A0], T0_16
1684 mov [A1], T0_16
1685 EPILOGUE_2_ARGS
1686ENDPROC iemAImpl_xchg_u16_locked
1687
1688BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1689 PROLOGUE_2_ARGS
1690 mov T0_32, [A1]
1691 xchg [A0], T0_32
1692 mov [A1], T0_32
1693 EPILOGUE_2_ARGS
1694ENDPROC iemAImpl_xchg_u32_locked
1695
1696%ifdef RT_ARCH_AMD64
1697BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1698 PROLOGUE_2_ARGS
1699 mov T0, [A1]
1700 xchg [A0], T0
1701 mov [A1], T0
1702 EPILOGUE_2_ARGS
1703ENDPROC iemAImpl_xchg_u64_locked
1704%endif
1705
; Unlocked variants for fDisregardLock mode: a plain load/load + store/store
; swap with no atomicity guarantees whatsoever.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A0]              ; current memory value
        mov     T0_8, [A1]              ; incoming register value
        mov     [A1], T1_8              ; swap the two
        mov     [A0], T0_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A0]             ; current memory value
        mov     T0_16, [A1]             ; incoming register value
        mov     [A1], T1_16             ; swap the two
        mov     [A0], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A0]             ; current memory value
        mov     T0_32, [A1]             ; incoming register value
        mov     [A1], T1_32             ; swap the two
        mov     [A0], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A0]                ; current memory value
        mov     T0, [A1]                ; incoming register value
        mov     [A1], T1                ; swap the two
        mov     [A0], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1745
1746
1747;
1748; XADD for memory operands.
1749;
1750; Each function takes three arguments, first the pointer to the
1751; memory/register, then the pointer to the register, and finally a pointer to
1752; eflags. They all return void.
1753;
1754BEGINCODE
1755BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1756 PROLOGUE_3_ARGS
1757 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1758 mov T0_8, [A1]
1759 xadd [A0], T0_8
1760 mov [A1], T0_8
1761 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1762 EPILOGUE_3_ARGS
1763ENDPROC iemAImpl_xadd_u8
1764
1765BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1766 PROLOGUE_3_ARGS
1767 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1768 mov T0_16, [A1]
1769 xadd [A0], T0_16
1770 mov [A1], T0_16
1771 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1772 EPILOGUE_3_ARGS
1773ENDPROC iemAImpl_xadd_u16
1774
1775BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1776 PROLOGUE_3_ARGS
1777 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1778 mov T0_32, [A1]
1779 xadd [A0], T0_32
1780 mov [A1], T0_32
1781 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1782 EPILOGUE_3_ARGS
1783ENDPROC iemAImpl_xadd_u32
1784
1785%ifdef RT_ARCH_AMD64
1786BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1787 PROLOGUE_3_ARGS
1788 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1789 mov T0, [A1]
1790 xadd [A0], T0
1791 mov [A1], T0
1792 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1793 EPILOGUE_3_ARGS
1794ENDPROC iemAImpl_xadd_u64
1795%endif ; RT_ARCH_AMD64
1796
; Locked XADD variants, identical to the above except for the LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8            ; atomic: [A0] += T0; T0 = old [A0]
        mov     [A1], T0_8
        IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1838
1839
1840;
1841; CMPXCHG8B.
1842;
1843; These are tricky register wise, so the code is duplicated for each calling
1844; convention.
1845;
1846; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1847;
1848; C-proto:
1849; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1850; uint32_t *pEFlags));
1851;
1852; Note! Identical to iemAImpl_cmpxchg16b.
1853;
1854BEGINCODE
1855BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1856%ifdef RT_ARCH_AMD64
1857 %ifdef ASM_CALL64_MSC
1858 push rbx
1859
1860 mov r11, rdx ; pu64EaxEdx (is also T1)
1861 mov r10, rcx ; pu64Dst
1862
1863 mov ebx, [r8]
1864 mov ecx, [r8 + 4]
1865 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1866 mov eax, [r11]
1867 mov edx, [r11 + 4]
1868
1869 cmpxchg8b [r10]
1870
1871 mov [r11], eax
1872 mov [r11 + 4], edx
1873 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1874
1875 pop rbx
1876 ret
1877 %else
1878 push rbx
1879
1880 mov r10, rcx ; pEFlags
1881 mov r11, rdx ; pu64EbxEcx (is also T1)
1882
1883 mov ebx, [r11]
1884 mov ecx, [r11 + 4]
1885 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1886 mov eax, [rsi]
1887 mov edx, [rsi + 4]
1888
1889 cmpxchg8b [rdi]
1890
1891 mov [rsi], eax
1892 mov [rsi + 4], edx
1893 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1894
1895 pop rbx
1896 ret
1897
1898 %endif
1899%else
1900 push esi
1901 push edi
1902 push ebx
1903 push ebp
1904
1905 mov edi, ecx ; pu64Dst
1906 mov esi, edx ; pu64EaxEdx
1907 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1908 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1909
1910 mov ebx, [ecx]
1911 mov ecx, [ecx + 4]
1912 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1913 mov eax, [esi]
1914 mov edx, [esi + 4]
1915
1916 cmpxchg8b [edi]
1917
1918 mov [esi], eax
1919 mov [esi + 4], edx
1920 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)
1921
1922 pop ebp
1923 pop ebx
1924 pop edi
1925 pop esi
1926 ret 8
1927%endif
1928ENDPROC iemAImpl_cmpxchg8b
1929
; Locked variant of iemAImpl_cmpxchg8b above; identical except for the LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; cmpxchg8b requires EBX; it is callee-saved.

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; EBX:ECX = replacement value (*pu64EbxEcx)
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; EAX:EDX = expected value - loaded after the flags
        mov     edx, [r11 + 4]          ;   macro above since that clobbers eax.

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; pass back EAX:EDX (the old memory value on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; cmpxchg8b requires EBX; it is callee-saved.

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; EBX:ECX = replacement value
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; EAX:EDX = expected value (*pu64EaxEdx)
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; pass back EAX:EDX
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi                     ; save all callee-saved registers we borrow
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = pushes, 4 = return address)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX:ECX = replacement value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EAX:EDX = expected value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; pass back EAX:EDX
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; two stack arguments to clean up
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
2004
2005%ifdef RT_ARCH_AMD64
2006
2007;
2008; CMPXCHG16B.
2009;
2010; These are tricky register wise, so the code is duplicated for each calling
2011; convention.
2012;
2013; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
2014;
2015; C-proto:
2016; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
2017; uint32_t *pEFlags));
2018;
2019; Note! Identical to iemAImpl_cmpxchg8b.
2020;
2021BEGINCODE
2022BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
2023 %ifdef ASM_CALL64_MSC
2024 push rbx
2025
2026 mov r11, rdx ; pu64RaxRdx (is also T1)
2027 mov r10, rcx ; pu64Dst
2028
2029 mov rbx, [r8]
2030 mov rcx, [r8 + 8]
2031 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2032 mov rax, [r11]
2033 mov rdx, [r11 + 8]
2034
2035 cmpxchg16b [r10]
2036
2037 mov [r11], rax
2038 mov [r11 + 8], rdx
2039 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2040
2041 pop rbx
2042 ret
2043 %else
2044 push rbx
2045
2046 mov r10, rcx ; pEFlags
2047 mov r11, rdx ; pu64RbxRcx (is also T1)
2048
2049 mov rbx, [r11]
2050 mov rcx, [r11 + 8]
2051 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2052 mov rax, [rsi]
2053 mov rdx, [rsi + 8]
2054
2055 cmpxchg16b [rdi]
2056
2057 mov [rsi], rax
2058 mov [rsi + 8], rdx
2059 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2060
2061 pop rbx
2062 ret
2063
2064 %endif
2065ENDPROC iemAImpl_cmpxchg16b
2066
;
; LOCK-prefixed (atomic) CMPXCHG16B worker; otherwise identical to
; iemAImpl_cmpxchg16b above.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; rbx is callee-saved; needed for the low exchange qword.

        mov     r11, rdx                ; pu128RaxRdx (is also T1)
        mov     r10, rcx                ; pu128Dst

        mov     rbx, [r8]               ; RBX:RCX = exchange value (*pu128RbxRcx).
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; RDX:RAX = compare value (*pu128RaxRdx).
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; Write back RDX:RAX (holds the old memory value on failure).
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu128RbxRcx (is also T1)

        mov     rbx, [r11]              ; RBX:RCX = exchange value (*pu128RbxRcx).
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; RDX:RAX = compare value (*pu128RaxRdx).
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; Write back RDX:RAX (holds the old memory value on failure).
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked
2111
2112%endif ; RT_ARCH_AMD64
2113
2114
2115;
2116; CMPXCHG.
2117;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
2122;
2123BEGINCODE
BEGINCODE
;;
; Emulates CMPXCHG r/m,reg for 8, 16, 32 and 64-bit operands.
;
; A0 = pointer to the destination (r/m), A1 = pointer to the guest accumulator
; (AL/AX/EAX/RAX), A2 = the register operand, A3 = pointer to eflags.
;
; @param 1      'lock' or empty.
; @param 2      Function name suffix ('_locked' or empty).
;
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; Load the guest accumulator into AL for cmpxchg.
        %1 cmpxchg [A0], A2_8
        mov     [A1], al                ; On failure AL holds the actual destination value.
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX:ECX = the register operand (exchange value).
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EDX:EAX = the accumulator (compare value).
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b only sets ZF, but we are emulating cmpxchg which sets all
        ; status flags from the comparison, so compute CF, PF, AF, SF and OF too.
        jnz     .cmpxchg8b_not_equal    ; Fix: was 'jz', which gave the failure path 'equal' flags.
        cmp     eax, eax                ; Equal: a self-compare produces the right flag values.
.store:
        mov     [esi], eax              ; On failure EDX:EAX now holds the actual memory value.
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ; Derive flags from comparing the expected value (still in *pu64Rax) with
        ; the actual one (in edx:eax): high dwords first, low dwords only when the
        ; high ones are equal. Note! AF/PF come from the deciding dword rather
        ; than a full 64-bit subtraction, so they are only approximated.
        cmp     [esi + 4], edx
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG
2210
; Instantiate the plain and LOCK-prefixed CMPXCHG workers.
IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
2213
2214
2215
2216;;
2217; Macro for implementing a unary operator.
2218;
2219; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
2220; variants, except on 32-bit system where the 64-bit accesses requires hand
2221; coding.
2222;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
2225;
2226; @param 1 The instruction mnemonic.
2227; @param 2 The modified flags.
2228; @param 3 The undefined flags.
2229;
; A0 = pointer to the operand, A1 = pointer to eflags (see header comment above).
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      byte [A0]               ; Apply the operator directly to the memory operand.
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 byte [A0]               ; Atomic variant.
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      word [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 word [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      dword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 dword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      qword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 qword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro
2299
; Note: inc/dec leave CF unchanged, hence CF is absent from their modified-flags mask.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0              ; NOT modifies no flags.
2304
2305
2306;
2307; BSWAP. No flag changes.
2308;
2309; Each function takes one argument, pointer to the value to bswap
2310; (input/output). They all return void.
2311;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h                          ; Operand-size prefix + 32-bit bswap = 16-bit bswap.
        bswap   T0_32                   ; NOTE(review): 16-bit bswap result is undefined per the
                                        ; SDM; this emits the literal encoding to reproduce
                                        ; whatever the host CPU does - presumably intentional.
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
2320
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; Load, byte-swap in a register, store back.
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
2328
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; 32-bit host: load both halves...
        mov     T1, [A0 + 4]
        bswap   T0                      ; ...byte-swap each half...
        bswap   T1
        mov     [A0 + 4], T0            ; ...and store them in swapped positions.
        mov     [A0], T1
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
2347
2348
2349;;
2350; Macro for implementing a shift operation.
2351;
2352; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2353; 32-bit system where the 64-bit accesses requires hand coding.
2354;
; All the functions take the input eflags in A0, a pointer to the destination
; memory operand in A1 and the shift count in A2; the updated eflags are returned.
2357;
2358; @param 1 The instruction mnemonic.
2359; @param 2 The modified flags.
2360; @param 3 The undefined flags.
2361; @param 4 Force load flags.
2362;
2363; Makes ASSUMPTIONS about A0, A1 and A2 assignments. Specifically, that with
2364; GCC/64 we're free to use RCX/CL as it isn't used for any arguments. While
2365; MSC/64 & 32-bit fastcall are using ECX for the first argument (fEFlagsIn),
2366; so we have to switch it around with the shift count parameter registers.
2367;
2368; @note the _intel and _amd variants are implemented in C.
2369;
; See the header comment above for parameters and the A0/A1/A2 register
; assumption notes (the count must end up in CL; MSC/32-bit use ECX for A0).
%macro IEMIMPL_SHIFT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8                ; GCC: RCX is free, so just move the count into CL.
        %1      byte [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0                  ; MSC/32-bit: A0 lives in ECX; swap so the count is in CL.
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      byte [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS                 ; (Removed an unreferenced '.zero_shift' label here.)
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      word [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      word [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      dword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      dword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      qword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      qword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro
2440
; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
; we have to force load all modified and undefined.
; (Rotates only touch OF and CF; shifts affect the whole status-flag set.)
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2451
2452
2453;;
2454; Macro for implementing a double precision shift operation.
2455;
2456; This will generate code for the 16, 32 and 64 bit accesses, except on
2457; 32-bit system where the 64-bit accesses requires hand coding.
2458;
2459; The functions takes the destination operand (r/m) in A0, the source (reg) in
2460; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
2461;
2462; @param 1 The instruction mnemonic.
2463; @param 2 The modified flags.
2464; @param 3 The undefined flags.
2465; @param 4 The force loaded flags.
2466;
2467; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
2468;
2469; @note the _intel and _amd variants are implemented in C.
2470;
; A0 = r/m, A1 = source register, A2 = shift count, A3 = pointer to eflags.
%macro IEMIMPL_SHIFT_DBL_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS_OLD A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; GCC: A3 is RCX; swap so the count is in CL...
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; ...and restore A3 (pEFlags) afterwards.
 %else
        xchg    A0, A2                  ; MSC/32-bit: A0 is ECX; swap so the count is in CL.
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS_OLD A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS_OLD A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro
2524
; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
; we have to force load all modified and undefined.
IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2530
2531
2532;;
2533; Macro for implementing a multiplication operations.
2534;
2535; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2536; 32-bit system where the 64-bit accesses requires hand coding.
2537;
2538; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2539; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2540; pointer to eflags in A3.
2541;
2542; The functions all return 0 so the caller can be used for div/idiv as well as
2543; for the mul/imul implementation.
2544;
2545; @param 1 The instruction mnemonic.
2546; @param 2 The modified flags.
2547; @param 3 The undefined flags.
2548; @param 4 Name suffix.
2549; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2550;
2551; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2552;
; See the header comment above: A0=prAX, A1=prDX, A2=operand, A3=pEFlags
; (8-bit variant: A0=pAX, A1=operand, A2=pEFlags). Always returns 0 in eax.
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax                ; The 8-bit mul/imul result is the whole of AX.
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX ; intel
 %endif
        xor     eax, eax                ; Return 0 (success convention shared with div/idiv).
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; A1 is (e)dx with MSC/32-bit, which %1 clobbers; save it.
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX ; intel
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; A1 is (e)dx with MSC/32-bit, which %1 clobbers; save it.
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX ; intel
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     rax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; A1 is rdx with MSC, which %1 clobbers; save it.
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX ; intel
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro
2642
; Params: mnemonic, modified flags, undefined flags, name suffix, EFLAGS behaviour.
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2649
2650
BEGINCODE
;;
; Worker function for negating a 64-bit value held in the 32-bit register
; pair T1:T0 (T1 = high dword), i.e. T1:T0 = 0 - T1:T0.
; @uses None besides T0, T1 - scratch space comes from the stack; EFLAGS are clobbered.
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0                       ; Put a pair of zeros on the stack...
        push    0
        xchg    T0_32, [xSP]            ; ...swap the value with the zeros...
        xchg    T1_32, [xSP + xCB]
        sub     T0_32, [xSP]            ; ...and subtract: T1:T0 = 0 - value.
        sbb     T1_32, [xSP + xCB]
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2665
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating a 128-bit value held in the 64-bit register
; pair T1:T0 (T1 = high qword), i.e. T1:T0 = 0 - T1:T0.
; @uses None besides T0, T1 - scratch space comes from the stack; EFLAGS are clobbered.
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0                       ; Put a pair of zeros on the stack...
        push    0
        xchg    T0, [xSP]               ; ...swap the value with the zeros...
        xchg    T1, [xSP + xCB]
        sub     T0, [xSP]               ; ...and subtract: T1:T0 = 0 - value.
        sbb     T1, [xSP + xCB]
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2681
2682
2683;;
2684; Macro for implementing a division operations.
2685;
2686; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2687; 32-bit system where the 64-bit accesses requires hand coding.
2688;
2689; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2690; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2691; pointer to eflags in A3.
2692;
2693; The functions all return 0 on success and -1 if a divide error should be
2694; raised by the caller.
2695;
2696; @param 1 The instruction mnemonic.
2697; @param 2 The modified flags.
2698; @param 3 The undefined flags.
2699; @param 4 1 if signed, 0 if unsigned.
2700; @param 5 Function suffix.
2701; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2702; 2 for AMD (set AF, clear PF, ZF and SF).
2703;
2704; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2705;
; See the header comment above for the parameter descriptions. All workers
; return 0 in eax on success and -1 when a #DE should be raised by the caller.
%macro IEMIMPL_DIV_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        and     A1_32, 0xff             ; Ensure it's zero extended to 16-bits for the idiv range check.
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; Unsigned: quotient overflows iff AH >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A0]        ; T0 = dividend (zero extending to full register to simplify register aliasing)
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (Intel)
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; Tell the caller to raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        and     A2_16, 0xffff           ; Zero extend it for simpler sign overflow checks (see below).
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; Unsigned: quotient overflows iff DX >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A1]        ; Zero extend to simplify register aliasing by clobbing the whole register.
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Divisor out of the way of dx:ax.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; A1 is (e)dx with MSC/32-bit; save before loading dx.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; Tell the caller to raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; Unsigned: quotient overflows iff EDX >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        ;test   A2_32, A2_32 - we did this 5 instructions ago.
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; (A redundant 'mov eax, [A0]' before this %ifdef was removed;
        mov     eax, [A0]               ;  both branches reload eax anyway.)
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; A1 is edx with MSC/32-bit; save before loading edx.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Drop the saved divisor (signed path pushed it).
 %endif
.div_zero:
        mov     eax, -1                 ; Tell the caller to raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2
        jz      .div_zero
 %if %4 == 0
        cmp     [A1], A2                ; Unsigned: quotient overflows iff RDX >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        ;test   A2, A2 - we did this five instructions above.
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; (A redundant 'mov rax, [A0]' before this %ifdef was removed;
        mov     rax, [A0]               ;  both branches reload rax anyway.)
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; A1 is rdx with MSC; save before loading rdx.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Drop the saved divisor (signed path pushed it).
 %endif
.div_zero:
        mov     eax, -1                 ; Tell the caller to raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro
3031
; Params: mnemonic, modified flags, undefined flags, signed?, suffix, EFLAGS variation.
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
3039
3040
3041;;
3042; Macro for implementing memory fence operation.
3043;
3044; No return value, no operands or anything.
3045;
3046; @param 1 The instruction.
3047;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; Emit the fence instruction itself (lfence/sfence/mfence).
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro
3055
; Instantiate the three fence workers.
IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
3059
;;
; Alternative for non-SSE2 hosts: an exchange with memory has an implicit LOCK
; and therefore acts as a full memory barrier.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; Scratch stack slot...
        xchg    xAX, [xSP]              ; ...serializing exchange with it (restores xAX too).
        add     xSP, xCB                ; Drop the slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
3069
3070
3071;;
3072; Initialize the FPU for the actual instruction being emulated, this means
3073; loading parts of the guest's control word and status word.
3074;
3075; @uses 24 bytes of stack. T0, T1
3076; @param 1 Expression giving the address of the FXSTATE of the guest.
3077;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; Start from the current host FPU environment.

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW] ; 32-bit forms for consistency with
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; ..._AND_FTW_0 below.
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK   ; Guest condition code bits...
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; ...merged with the host TOP field.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; Activate the merged environment.
%endmacro
3096
3097
3098;;
3099; Initialize the FPU for the actual instruction being emulated, this means
3100; loading parts of the guest's control word, status word, and update the
3101; tag word for the top register if it's empty.
3102;
3103; ASSUMES actual TOP=7
3104;
3105; @uses 24 bytes of stack. T0, T1
3106; @param 1 Expression giving the address of the FXSTATE of the guest.
3107;
3108%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
3109 fnstenv [xSP]
3110
3111 ; FCW - for exception, precision and rounding control.
3112 movzx T0_32, word [%1 + X86FXSTATE.FCW]
3113 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
3114 mov [xSP + X86FSTENV32P.FCW], T0_16
3115
3116 ; FSW - for undefined C0, C1, C2, and C3.
3117 movzx T1_32, word [%1 + X86FXSTATE.FSW]
3118 and T1_32, X86_FSW_C_MASK
3119 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
3120 and T0_32, X86_FSW_TOP_MASK
3121 or T0_32, T1_32
3122 mov [xSP + X86FSTENV32P.FSW], T0_16
3123
3124 ; FTW - Only for ST0 (in/out).
3125 movzx T1_32, word [%1 + X86FXSTATE.FSW]
3126 shr T1_32, X86_FSW_TOP_SHIFT
3127 and T1_32, X86_FSW_TOP_SMASK
3128 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
3129 jc %%st0_not_empty
3130 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
3131%%st0_not_empty:
3132
3133 fldenv [xSP]
3134%endmacro
3135
3136
3137;;
3138; Need to move this as well somewhere better?
3139;
3140struc IEMFPURESULT
3141 .r80Result resw 5
3142 .FSW resw 1
3143endstruc
3144
3145
3146;;
3147; Need to move this as well somewhere better?
3148;
3149struc IEMFPURESULTTWO
3150 .r80Result1 resw 5
3151 .FSW resw 1
3152 .r80Result2 resw 5
3153endstruc
3154
3155
3156;
3157;---------------------- 16-bit signed integer operations ----------------------
3158;
3159
3160
3161;;
3162; Converts a 16-bit floating point value to a 80-bit one (fpu register).
3163;
3164; @param A0 FPU context (fxsave).
3165; @param A1 Pointer to a IEMFPURESULT for the output.
3166; @param A2 Pointer to the 16-bit floating point value to convert.
3167;
3168BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
3169 PROLOGUE_3_ARGS
3170 sub xSP, 20h
3171
3172 fninit
3173 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3174 fild word [A2]
3175
3176 fnstsw word [A1 + IEMFPURESULT.FSW]
3177 fnclex
3178 fstp tword [A1 + IEMFPURESULT.r80Result]
3179
3180 fninit
3181 add xSP, 20h
3182 EPILOGUE_3_ARGS
3183ENDPROC iemAImpl_fild_r80_from_i16
3184
3185
3186;;
3187; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
3188;
3189; @param A0 FPU context (fxsave).
3190; @param A1 Where to return the output FSW.
3191; @param A2 Where to store the 16-bit signed integer value.
3192; @param A3 Pointer to the 80-bit value.
3193;
3194BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
3195 PROLOGUE_4_ARGS
3196 sub xSP, 20h
3197
3198 fninit
3199 fld tword [A3]
3200 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3201 fistp word [A2]
3202
3203 fnstsw word [A1]
3204
3205 fninit
3206 add xSP, 20h
3207 EPILOGUE_4_ARGS
3208ENDPROC iemAImpl_fist_r80_to_i16
3209
3210
3211;;
3212; Store a 80-bit floating point value (register) as a 16-bit signed integer
3213; (memory) with truncation.
3214;
3215; @param A0 FPU context (fxsave).
3216; @param A1 Where to return the output FSW.
3217; @param A2 Where to store the 16-bit signed integer value.
3218; @param A3 Pointer to the 80-bit value.
3219;
3220BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
3221 PROLOGUE_4_ARGS
3222 sub xSP, 20h
3223
3224 fninit
3225 fld tword [A3]
3226 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3227 fisttp word [A2]
3228
3229 fnstsw word [A1]
3230
3231 fninit
3232 add xSP, 20h
3233 EPILOGUE_4_ARGS
3234ENDPROC iemAImpl_fistt_r80_to_i16
3235
3236
3237;;
3238; FPU instruction working on one 80-bit and one 16-bit signed integer value.
3239;
3240; @param 1 The instruction
3241;
3242; @param A0 FPU context (fxsave).
3243; @param A1 Pointer to a IEMFPURESULT for the output.
3244; @param A2 Pointer to the 80-bit value.
3245; @param A3 Pointer to the 16-bit value.
3246;
3247%macro IEMIMPL_FPU_R80_BY_I16 1
3248BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
3249 PROLOGUE_4_ARGS
3250 sub xSP, 20h
3251
3252 fninit
3253 fld tword [A2]
3254 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3255 %1 word [A3]
3256
3257 fnstsw word [A1 + IEMFPURESULT.FSW]
3258 fnclex
3259 fstp tword [A1 + IEMFPURESULT.r80Result]
3260
3261 fninit
3262 add xSP, 20h
3263 EPILOGUE_4_ARGS
3264ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
3265%endmacro
3266
3267IEMIMPL_FPU_R80_BY_I16 fiadd
3268IEMIMPL_FPU_R80_BY_I16 fimul
3269IEMIMPL_FPU_R80_BY_I16 fisub
3270IEMIMPL_FPU_R80_BY_I16 fisubr
3271IEMIMPL_FPU_R80_BY_I16 fidiv
3272IEMIMPL_FPU_R80_BY_I16 fidivr
3273
3274
3275;;
3276; FPU instruction working on one 80-bit and one 16-bit signed integer value,
3277; only returning FSW.
3278;
3279; @param 1 The instruction
3280;
3281; @param A0 FPU context (fxsave).
3282; @param A1 Where to store the output FSW.
3283; @param A2 Pointer to the 80-bit value.
3284; @param A3 Pointer to the 64-bit value.
3285;
3286%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
3287BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
3288 PROLOGUE_4_ARGS
3289 sub xSP, 20h
3290
3291 fninit
3292 fld tword [A2]
3293 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3294 %1 word [A3]
3295
3296 fnstsw word [A1]
3297
3298 fninit
3299 add xSP, 20h
3300 EPILOGUE_4_ARGS
3301ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
3302%endmacro
3303
3304IEMIMPL_FPU_R80_BY_I16_FSW ficom
3305
3306
3307
3308;
3309;---------------------- 32-bit signed integer operations ----------------------
3310;
3311
3312
3313;;
3314; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3315;
3316; @param A0 FPU context (fxsave).
3317; @param A1 Pointer to a IEMFPURESULT for the output.
3318; @param A2 Pointer to the 32-bit floating point value to convert.
3319;
3320BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
3321 PROLOGUE_3_ARGS
3322 sub xSP, 20h
3323
3324 fninit
3325 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3326 fild dword [A2]
3327
3328 fnstsw word [A1 + IEMFPURESULT.FSW]
3329 fnclex
3330 fstp tword [A1 + IEMFPURESULT.r80Result]
3331
3332 fninit
3333 add xSP, 20h
3334 EPILOGUE_3_ARGS
3335ENDPROC iemAImpl_fild_r80_from_i32
3336
3337
3338;;
3339; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
3340;
3341; @param A0 FPU context (fxsave).
3342; @param A1 Where to return the output FSW.
3343; @param A2 Where to store the 32-bit signed integer value.
3344; @param A3 Pointer to the 80-bit value.
3345;
3346BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
3347 PROLOGUE_4_ARGS
3348 sub xSP, 20h
3349
3350 fninit
3351 fld tword [A3]
3352 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3353 fistp dword [A2]
3354
3355 fnstsw word [A1]
3356
3357 fninit
3358 add xSP, 20h
3359 EPILOGUE_4_ARGS
3360ENDPROC iemAImpl_fist_r80_to_i32
3361
3362
3363;;
3364; Store a 80-bit floating point value (register) as a 32-bit signed integer
3365; (memory) with truncation.
3366;
3367; @param A0 FPU context (fxsave).
3368; @param A1 Where to return the output FSW.
3369; @param A2 Where to store the 32-bit signed integer value.
3370; @param A3 Pointer to the 80-bit value.
3371;
3372BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
3373 PROLOGUE_4_ARGS
3374 sub xSP, 20h
3375
3376 fninit
3377 fld tword [A3]
3378 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3379 fisttp dword [A2]
3380
3381 fnstsw word [A1]
3382
3383 fninit
3384 add xSP, 20h
3385 EPILOGUE_4_ARGS
3386ENDPROC iemAImpl_fistt_r80_to_i32
3387
3388
3389;;
3390; FPU instruction working on one 80-bit and one 32-bit signed integer value.
3391;
3392; @param 1 The instruction
3393;
3394; @param A0 FPU context (fxsave).
3395; @param A1 Pointer to a IEMFPURESULT for the output.
3396; @param A2 Pointer to the 80-bit value.
3397; @param A3 Pointer to the 32-bit value.
3398;
3399%macro IEMIMPL_FPU_R80_BY_I32 1
3400BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
3401 PROLOGUE_4_ARGS
3402 sub xSP, 20h
3403
3404 fninit
3405 fld tword [A2]
3406 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3407 %1 dword [A3]
3408
3409 fnstsw word [A1 + IEMFPURESULT.FSW]
3410 fnclex
3411 fstp tword [A1 + IEMFPURESULT.r80Result]
3412
3413 fninit
3414 add xSP, 20h
3415 EPILOGUE_4_ARGS
3416ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
3417%endmacro
3418
3419IEMIMPL_FPU_R80_BY_I32 fiadd
3420IEMIMPL_FPU_R80_BY_I32 fimul
3421IEMIMPL_FPU_R80_BY_I32 fisub
3422IEMIMPL_FPU_R80_BY_I32 fisubr
3423IEMIMPL_FPU_R80_BY_I32 fidiv
3424IEMIMPL_FPU_R80_BY_I32 fidivr
3425
3426
3427;;
3428; FPU instruction working on one 80-bit and one 32-bit signed integer value,
3429; only returning FSW.
3430;
3431; @param 1 The instruction
3432;
3433; @param A0 FPU context (fxsave).
3434; @param A1 Where to store the output FSW.
3435; @param A2 Pointer to the 80-bit value.
3436; @param A3 Pointer to the 64-bit value.
3437;
3438%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
3439BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
3440 PROLOGUE_4_ARGS
3441 sub xSP, 20h
3442
3443 fninit
3444 fld tword [A2]
3445 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3446 %1 dword [A3]
3447
3448 fnstsw word [A1]
3449
3450 fninit
3451 add xSP, 20h
3452 EPILOGUE_4_ARGS
3453ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
3454%endmacro
3455
3456IEMIMPL_FPU_R80_BY_I32_FSW ficom
3457
3458
3459
3460;
3461;---------------------- 64-bit signed integer operations ----------------------
3462;
3463
3464
3465;;
3466; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3467;
3468; @param A0 FPU context (fxsave).
3469; @param A1 Pointer to a IEMFPURESULT for the output.
3470; @param A2 Pointer to the 64-bit floating point value to convert.
3471;
3472BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
3473 PROLOGUE_3_ARGS
3474 sub xSP, 20h
3475
3476 fninit
3477 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3478 fild qword [A2]
3479
3480 fnstsw word [A1 + IEMFPURESULT.FSW]
3481 fnclex
3482 fstp tword [A1 + IEMFPURESULT.r80Result]
3483
3484 fninit
3485 add xSP, 20h
3486 EPILOGUE_3_ARGS
3487ENDPROC iemAImpl_fild_r80_from_i64
3488
3489
3490;;
3491; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
3492;
3493; @param A0 FPU context (fxsave).
3494; @param A1 Where to return the output FSW.
3495; @param A2 Where to store the 64-bit signed integer value.
3496; @param A3 Pointer to the 80-bit value.
3497;
3498BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
3499 PROLOGUE_4_ARGS
3500 sub xSP, 20h
3501
3502 fninit
3503 fld tword [A3]
3504 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3505 fistp qword [A2]
3506
3507 fnstsw word [A1]
3508
3509 fninit
3510 add xSP, 20h
3511 EPILOGUE_4_ARGS
3512ENDPROC iemAImpl_fist_r80_to_i64
3513
3514
3515;;
3516; Store a 80-bit floating point value (register) as a 64-bit signed integer
3517; (memory) with truncation.
3518;
3519; @param A0 FPU context (fxsave).
3520; @param A1 Where to return the output FSW.
3521; @param A2 Where to store the 64-bit signed integer value.
3522; @param A3 Pointer to the 80-bit value.
3523;
3524BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
3525 PROLOGUE_4_ARGS
3526 sub xSP, 20h
3527
3528 fninit
3529 fld tword [A3]
3530 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3531 fisttp qword [A2]
3532
3533 fnstsw word [A1]
3534
3535 fninit
3536 add xSP, 20h
3537 EPILOGUE_4_ARGS
3538ENDPROC iemAImpl_fistt_r80_to_i64
3539
3540
3541
3542;
3543;---------------------- 32-bit floating point operations ----------------------
3544;
3545
3546;;
3547; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3548;
3549; @param A0 FPU context (fxsave).
3550; @param A1 Pointer to a IEMFPURESULT for the output.
3551; @param A2 Pointer to the 32-bit floating point value to convert.
3552;
3553BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
3554 PROLOGUE_3_ARGS
3555 sub xSP, 20h
3556
3557 fninit
3558 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3559 fld dword [A2]
3560
3561 fnstsw word [A1 + IEMFPURESULT.FSW]
3562 fnclex
3563 fstp tword [A1 + IEMFPURESULT.r80Result]
3564
3565 fninit
3566 add xSP, 20h
3567 EPILOGUE_3_ARGS
3568ENDPROC iemAImpl_fld_r80_from_r32
3569
3570
3571;;
3572; Store a 80-bit floating point value (register) as a 32-bit one (memory).
3573;
3574; @param A0 FPU context (fxsave).
3575; @param A1 Where to return the output FSW.
3576; @param A2 Where to store the 32-bit value.
3577; @param A3 Pointer to the 80-bit value.
3578;
3579BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
3580 PROLOGUE_4_ARGS
3581 sub xSP, 20h
3582
3583 fninit
3584 fld tword [A3]
3585 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3586 fst dword [A2]
3587
3588 fnstsw word [A1]
3589
3590 fninit
3591 add xSP, 20h
3592 EPILOGUE_4_ARGS
3593ENDPROC iemAImpl_fst_r80_to_r32
3594
3595
3596;;
3597; FPU instruction working on one 80-bit and one 32-bit floating point value.
3598;
3599; @param 1 The instruction
3600;
3601; @param A0 FPU context (fxsave).
3602; @param A1 Pointer to a IEMFPURESULT for the output.
3603; @param A2 Pointer to the 80-bit value.
3604; @param A3 Pointer to the 32-bit value.
3605;
3606%macro IEMIMPL_FPU_R80_BY_R32 1
3607BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3608 PROLOGUE_4_ARGS
3609 sub xSP, 20h
3610
3611 fninit
3612 fld tword [A2]
3613 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3614 %1 dword [A3]
3615
3616 fnstsw word [A1 + IEMFPURESULT.FSW]
3617 fnclex
3618 fstp tword [A1 + IEMFPURESULT.r80Result]
3619
3620 fninit
3621 add xSP, 20h
3622 EPILOGUE_4_ARGS
3623ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3624%endmacro
3625
3626IEMIMPL_FPU_R80_BY_R32 fadd
3627IEMIMPL_FPU_R80_BY_R32 fmul
3628IEMIMPL_FPU_R80_BY_R32 fsub
3629IEMIMPL_FPU_R80_BY_R32 fsubr
3630IEMIMPL_FPU_R80_BY_R32 fdiv
3631IEMIMPL_FPU_R80_BY_R32 fdivr
3632
3633
3634;;
3635; FPU instruction working on one 80-bit and one 32-bit floating point value,
3636; only returning FSW.
3637;
3638; @param 1 The instruction
3639;
3640; @param A0 FPU context (fxsave).
3641; @param A1 Where to store the output FSW.
3642; @param A2 Pointer to the 80-bit value.
3643; @param A3 Pointer to the 64-bit value.
3644;
3645%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3646BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3647 PROLOGUE_4_ARGS
3648 sub xSP, 20h
3649
3650 fninit
3651 fld tword [A2]
3652 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3653 %1 dword [A3]
3654
3655 fnstsw word [A1]
3656
3657 fninit
3658 add xSP, 20h
3659 EPILOGUE_4_ARGS
3660ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3661%endmacro
3662
3663IEMIMPL_FPU_R80_BY_R32_FSW fcom
3664
3665
3666
3667;
3668;---------------------- 64-bit floating point operations ----------------------
3669;
3670
3671;;
3672; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3673;
3674; @param A0 FPU context (fxsave).
3675; @param A1 Pointer to a IEMFPURESULT for the output.
3676; @param A2 Pointer to the 64-bit floating point value to convert.
3677;
3678BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3679 PROLOGUE_3_ARGS
3680 sub xSP, 20h
3681
3682 fninit
3683 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3684 fld qword [A2]
3685
3686 fnstsw word [A1 + IEMFPURESULT.FSW]
3687 fnclex
3688 fstp tword [A1 + IEMFPURESULT.r80Result]
3689
3690 fninit
3691 add xSP, 20h
3692 EPILOGUE_3_ARGS
3693ENDPROC iemAImpl_fld_r80_from_r64
3694
3695
3696;;
3697; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3698;
3699; @param A0 FPU context (fxsave).
3700; @param A1 Where to return the output FSW.
3701; @param A2 Where to store the 64-bit value.
3702; @param A3 Pointer to the 80-bit value.
3703;
3704BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3705 PROLOGUE_4_ARGS
3706 sub xSP, 20h
3707
3708 fninit
3709 fld tword [A3]
3710 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3711 fst qword [A2]
3712
3713 fnstsw word [A1]
3714
3715 fninit
3716 add xSP, 20h
3717 EPILOGUE_4_ARGS
3718ENDPROC iemAImpl_fst_r80_to_r64
3719
3720
3721;;
3722; FPU instruction working on one 80-bit and one 64-bit floating point value.
3723;
3724; @param 1 The instruction
3725;
3726; @param A0 FPU context (fxsave).
3727; @param A1 Pointer to a IEMFPURESULT for the output.
3728; @param A2 Pointer to the 80-bit value.
3729; @param A3 Pointer to the 64-bit value.
3730;
3731%macro IEMIMPL_FPU_R80_BY_R64 1
3732BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3733 PROLOGUE_4_ARGS
3734 sub xSP, 20h
3735
3736 fninit
3737 fld tword [A2]
3738 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3739 %1 qword [A3]
3740
3741 fnstsw word [A1 + IEMFPURESULT.FSW]
3742 fnclex
3743 fstp tword [A1 + IEMFPURESULT.r80Result]
3744
3745 fninit
3746 add xSP, 20h
3747 EPILOGUE_4_ARGS
3748ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3749%endmacro
3750
3751IEMIMPL_FPU_R80_BY_R64 fadd
3752IEMIMPL_FPU_R80_BY_R64 fmul
3753IEMIMPL_FPU_R80_BY_R64 fsub
3754IEMIMPL_FPU_R80_BY_R64 fsubr
3755IEMIMPL_FPU_R80_BY_R64 fdiv
3756IEMIMPL_FPU_R80_BY_R64 fdivr
3757
3758;;
3759; FPU instruction working on one 80-bit and one 64-bit floating point value,
3760; only returning FSW.
3761;
3762; @param 1 The instruction
3763;
3764; @param A0 FPU context (fxsave).
3765; @param A1 Where to store the output FSW.
3766; @param A2 Pointer to the 80-bit value.
3767; @param A3 Pointer to the 64-bit value.
3768;
3769%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3770BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3771 PROLOGUE_4_ARGS
3772 sub xSP, 20h
3773
3774 fninit
3775 fld tword [A2]
3776 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3777 %1 qword [A3]
3778
3779 fnstsw word [A1]
3780
3781 fninit
3782 add xSP, 20h
3783 EPILOGUE_4_ARGS
3784ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3785%endmacro
3786
3787IEMIMPL_FPU_R80_BY_R64_FSW fcom
3788
3789
3790
3791;
3792;---------------------- 80-bit floating point operations ----------------------
3793;
3794
3795;;
3796; Loads a 80-bit floating point register value from memory.
3797;
3798; @param A0 FPU context (fxsave).
3799; @param A1 Pointer to a IEMFPURESULT for the output.
3800; @param A2 Pointer to the 80-bit floating point value to load.
3801;
3802BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3803 PROLOGUE_3_ARGS
3804 sub xSP, 20h
3805
3806 fninit
3807 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3808 fld tword [A2]
3809
3810 fnstsw word [A1 + IEMFPURESULT.FSW]
3811 fnclex
3812 fstp tword [A1 + IEMFPURESULT.r80Result]
3813
3814 fninit
3815 add xSP, 20h
3816 EPILOGUE_3_ARGS
3817ENDPROC iemAImpl_fld_r80_from_r80
3818
3819
3820;;
3821; Store a 80-bit floating point register to memory
3822;
3823; @param A0 FPU context (fxsave).
3824; @param A1 Where to return the output FSW.
3825; @param A2 Where to store the 80-bit value.
3826; @param A3 Pointer to the 80-bit register value.
3827;
3828BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3829 PROLOGUE_4_ARGS
3830 sub xSP, 20h
3831
3832 fninit
3833 fld tword [A3]
3834 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3835 fstp tword [A2]
3836
3837 fnstsw word [A1]
3838
3839 fninit
3840 add xSP, 20h
3841 EPILOGUE_4_ARGS
3842ENDPROC iemAImpl_fst_r80_to_r80
3843
3844
3845;;
3846; Loads an 80-bit floating point register value in BCD format from memory.
3847;
3848; @param A0 FPU context (fxsave).
3849; @param A1 Pointer to a IEMFPURESULT for the output.
3850; @param A2 Pointer to the 80-bit BCD value to load.
3851;
3852BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3853 PROLOGUE_3_ARGS
3854 sub xSP, 20h
3855
3856 fninit
3857 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3858 fbld tword [A2]
3859
3860 fnstsw word [A1 + IEMFPURESULT.FSW]
3861 fnclex
3862 fstp tword [A1 + IEMFPURESULT.r80Result]
3863
3864 fninit
3865 add xSP, 20h
3866 EPILOGUE_3_ARGS
3867ENDPROC iemAImpl_fld_r80_from_d80
3868
3869
3870;;
3871; Store a 80-bit floating point register to memory as BCD
3872;
3873; @param A0 FPU context (fxsave).
3874; @param A1 Where to return the output FSW.
3875; @param A2 Where to store the 80-bit BCD value.
3876; @param A3 Pointer to the 80-bit register value.
3877;
3878BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3879 PROLOGUE_4_ARGS
3880 sub xSP, 20h
3881
3882 fninit
3883 fld tword [A3]
3884 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3885 fbstp tword [A2]
3886
3887 fnstsw word [A1]
3888
3889 fninit
3890 add xSP, 20h
3891 EPILOGUE_4_ARGS
3892ENDPROC iemAImpl_fst_r80_to_d80
3893
3894
3895;;
3896; FPU instruction working on two 80-bit floating point values.
3897;
3898; @param 1 The instruction
3899;
3900; @param A0 FPU context (fxsave).
3901; @param A1 Pointer to a IEMFPURESULT for the output.
3902; @param A2 Pointer to the first 80-bit value (ST0)
3903; @param A3 Pointer to the second 80-bit value (STn).
3904;
3905%macro IEMIMPL_FPU_R80_BY_R80 2
3906BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3907 PROLOGUE_4_ARGS
3908 sub xSP, 20h
3909
3910 fninit
3911 fld tword [A3]
3912 fld tword [A2]
3913 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3914 %1 %2
3915
3916 fnstsw word [A1 + IEMFPURESULT.FSW]
3917 fnclex
3918 fstp tword [A1 + IEMFPURESULT.r80Result]
3919
3920 fninit
3921 add xSP, 20h
3922 EPILOGUE_4_ARGS
3923ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3924%endmacro
3925
3926IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3927IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3928IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3929IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3930IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3931IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3932IEMIMPL_FPU_R80_BY_R80 fprem, {}
3933IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3934IEMIMPL_FPU_R80_BY_R80 fscale, {}
3935
3936
3937;;
3938; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3939; storing the result in ST1 and popping the stack.
3940;
3941; @param 1 The instruction
3942;
3943; @param A0 FPU context (fxsave).
3944; @param A1 Pointer to a IEMFPURESULT for the output.
3945; @param A2 Pointer to the first 80-bit value (ST1).
3946; @param A3 Pointer to the second 80-bit value (ST0).
3947;
3948%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3949BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3950 PROLOGUE_4_ARGS
3951 sub xSP, 20h
3952
3953 fninit
3954 fld tword [A2]
3955 fld tword [A3]
3956 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3957 %1
3958
3959 fnstsw word [A1 + IEMFPURESULT.FSW]
3960 fnclex
3961 fstp tword [A1 + IEMFPURESULT.r80Result]
3962
3963 fninit
3964 add xSP, 20h
3965 EPILOGUE_4_ARGS
3966ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3967%endmacro
3968
3969IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3970IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3971IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3972
3973
3974;;
3975; FPU instruction working on two 80-bit floating point values, only
3976; returning FSW.
3977;
3978; @param 1 The instruction
3979;
3980; @param A0 FPU context (fxsave).
3981; @param A1 Pointer to a uint16_t for the resulting FSW.
3982; @param A2 Pointer to the first 80-bit value.
3983; @param A3 Pointer to the second 80-bit value.
3984;
3985%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3986BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3987 PROLOGUE_4_ARGS
3988 sub xSP, 20h
3989
3990 fninit
3991 fld tword [A3]
3992 fld tword [A2]
3993 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3994 %1 st0, st1
3995
3996 fnstsw word [A1]
3997
3998 fninit
3999 add xSP, 20h
4000 EPILOGUE_4_ARGS
4001ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
4002%endmacro
4003
4004IEMIMPL_FPU_R80_BY_R80_FSW fcom
4005IEMIMPL_FPU_R80_BY_R80_FSW fucom
4006
4007
4008;;
4009; FPU instruction working on two 80-bit floating point values,
4010; returning FSW and EFLAGS (eax).
4011;
4012; @param 1 The instruction
4013;
4014; @returns EFLAGS in EAX.
4015; @param A0 FPU context (fxsave).
4016; @param A1 Pointer to a uint16_t for the resulting FSW.
4017; @param A2 Pointer to the first 80-bit value.
4018; @param A3 Pointer to the second 80-bit value.
4019;
4020%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
4021BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
4022 PROLOGUE_4_ARGS
4023 sub xSP, 20h
4024
4025 fninit
4026 fld tword [A3]
4027 fld tword [A2]
4028 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4029 %1 st1
4030
4031 fnstsw word [A1]
4032 pushf
4033 pop xAX
4034
4035 fninit
4036 add xSP, 20h
4037 EPILOGUE_4_ARGS
4038ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
4039%endmacro
4040
4041IEMIMPL_FPU_R80_BY_R80_EFL fcomi
4042IEMIMPL_FPU_R80_BY_R80_EFL fucomi
4043
4044
4045;;
4046; FPU instruction working on one 80-bit floating point value.
4047;
4048; @param 1 The instruction
4049;
4050; @param A0 FPU context (fxsave).
4051; @param A1 Pointer to a IEMFPURESULT for the output.
4052; @param A2 Pointer to the 80-bit value.
4053;
4054%macro IEMIMPL_FPU_R80 1
4055BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
4056 PROLOGUE_3_ARGS
4057 sub xSP, 20h
4058
4059 fninit
4060 fld tword [A2]
4061 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4062 %1
4063
4064 fnstsw word [A1 + IEMFPURESULT.FSW]
4065 fnclex
4066 fstp tword [A1 + IEMFPURESULT.r80Result]
4067
4068 fninit
4069 add xSP, 20h
4070 EPILOGUE_3_ARGS
4071ENDPROC iemAImpl_ %+ %1 %+ _r80
4072%endmacro
4073
4074IEMIMPL_FPU_R80 fchs
4075IEMIMPL_FPU_R80 fabs
4076IEMIMPL_FPU_R80 f2xm1
4077IEMIMPL_FPU_R80 fsqrt
4078IEMIMPL_FPU_R80 frndint
4079IEMIMPL_FPU_R80 fsin
4080IEMIMPL_FPU_R80 fcos
4081
4082
4083;;
4084; FPU instruction working on one 80-bit floating point value, only
4085; returning FSW.
4086;
4087; @param 1 The instruction
4088; @param 2 Non-zero to also restore FTW.
4089;
4090; @param A0 FPU context (fxsave).
4091; @param A1 Pointer to a uint16_t for the resulting FSW.
4092; @param A2 Pointer to the 80-bit value.
4093;
4094%macro IEMIMPL_FPU_R80_FSW 2
4095BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
4096 PROLOGUE_3_ARGS
4097 sub xSP, 20h
4098
4099 fninit
4100 fld tword [A2]
4101%if %2 != 0
4102 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
4103%else
4104 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4105%endif
4106 %1
4107
4108 fnstsw word [A1]
4109
4110 fninit
4111 add xSP, 20h
4112 EPILOGUE_3_ARGS
4113ENDPROC iemAImpl_ %+ %1 %+ _r80
4114%endmacro
4115
4116IEMIMPL_FPU_R80_FSW ftst, 0
4117IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
4118
4119
4120
4121;;
4122; FPU instruction loading a 80-bit floating point constant.
4123;
4124; @param 1 The instruction
4125;
4126; @param A0 FPU context (fxsave).
4127; @param A1 Pointer to a IEMFPURESULT for the output.
4128;
4129%macro IEMIMPL_FPU_R80_CONST 1
4130BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
4131 PROLOGUE_2_ARGS
4132 sub xSP, 20h
4133
4134 fninit
4135 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4136 %1
4137
4138 fnstsw word [A1 + IEMFPURESULT.FSW]
4139 fnclex
4140 fstp tword [A1 + IEMFPURESULT.r80Result]
4141
4142 fninit
4143 add xSP, 20h
4144 EPILOGUE_2_ARGS
4145ENDPROC iemAImpl_ %+ %1 %+
4146%endmacro
4147
4148IEMIMPL_FPU_R80_CONST fld1
4149IEMIMPL_FPU_R80_CONST fldl2t
4150IEMIMPL_FPU_R80_CONST fldl2e
4151IEMIMPL_FPU_R80_CONST fldpi
4152IEMIMPL_FPU_R80_CONST fldlg2
4153IEMIMPL_FPU_R80_CONST fldln2
4154IEMIMPL_FPU_R80_CONST fldz
4155
4156
4157;;
4158; FPU instruction working on one 80-bit floating point value, outputing two.
4159;
4160; @param 1 The instruction
4161;
4162; @param A0 FPU context (fxsave).
4163; @param A1 Pointer to a IEMFPURESULTTWO for the output.
4164; @param A2 Pointer to the 80-bit value.
4165;
4166%macro IEMIMPL_FPU_R80_R80 1
4167BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
4168 PROLOGUE_3_ARGS
4169 sub xSP, 20h
4170
4171 fninit
4172 fld tword [A2]
4173 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4174 %1
4175
4176 fnstsw word [A1 + IEMFPURESULTTWO.FSW]
4177 fnclex
4178 fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
4179 fnclex
4180 fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
4181
4182 fninit
4183 add xSP, 20h
4184 EPILOGUE_3_ARGS
4185ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
4186%endmacro
4187
4188IEMIMPL_FPU_R80_R80 fptan
4189IEMIMPL_FPU_R80_R80 fxtract
4190IEMIMPL_FPU_R80_R80 fsincos
4191
4192
4193
4194
4195;---------------------- SSE and MMX Operations ----------------------
4196
;; @todo what do we need to do for MMX?
; Currently no-ops; placeholders for any host MMX state save/restore.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
; Currently no-ops; placeholders for any host SSE state save/restore.
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
; Currently no-ops; placeholders for any host AVX state save/restore.
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
4214
4215
4216;;
4217; Media instruction working on two full sized registers.
4218;
4219; @param 1 The instruction
4220; @param 2 Whether there is an MMX variant (1) or not (0).
4221;
4222; @param A0 FPU context (fxsave).
4223; @param A1 Pointer to the first media register size operand (input/output).
4224; @param A2 Pointer to the second media register size operand (input).
4225;
4226; @todo r=aeichner Currently unused, can probably be removed.
4227;
4228%macro IEMIMPL_MEDIA_F2 2
4229%if %2 != 0
4230BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
4231 PROLOGUE_3_ARGS
4232 IEMIMPL_MMX_PROLOGUE
4233
4234 movq mm0, [A1]
4235 movq mm1, [A2]
4236 %1 mm0, mm1
4237 movq [A1], mm0
4238
4239 IEMIMPL_MMX_EPILOGUE
4240 EPILOGUE_3_ARGS
4241ENDPROC iemAImpl_ %+ %1 %+ _u64
4242%endif
4243
4244BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4245 PROLOGUE_3_ARGS
4246 IEMIMPL_SSE_PROLOGUE
4247
4248 movdqu xmm0, [A1]
4249 movdqu xmm1, [A2]
4250 %1 xmm0, xmm1
4251 movdqu [A1], xmm0
4252
4253 IEMIMPL_SSE_EPILOGUE
4254 EPILOGUE_3_ARGS
4255ENDPROC iemAImpl_ %+ %1 %+ _u128
4256%endmacro
4257
4258;;
4259; Media instruction working on two full sized registers, but no FXSAVE state argument.
4260;
4261; @param 1 The instruction
4262; @param 2 Whether there is an MMX variant (1) or not (0).
4263;
4264; @param A0 Pointer to the first media register size operand (input/output).
4265; @param A1 Pointer to the second media register size operand (input).
4266;
4267%macro IEMIMPL_MEDIA_OPT_F2 2
4268%if %2 != 0
4269BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
4270 PROLOGUE_2_ARGS
4271 IEMIMPL_MMX_PROLOGUE
4272
4273 movq mm0, [A0]
4274 movq mm1, [A1]
4275 %1 mm0, mm1
4276 movq [A0], mm0
4277
4278 IEMIMPL_MMX_EPILOGUE
4279 EPILOGUE_2_ARGS
4280ENDPROC iemAImpl_ %+ %1 %+ _u64
4281%endif
4282
4283BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
4284 PROLOGUE_2_ARGS
4285 IEMIMPL_SSE_PROLOGUE
4286
4287 movdqu xmm0, [A0]
4288 movdqu xmm1, [A1]
4289 %1 xmm0, xmm1
4290 movdqu [A0], xmm0
4291
4292 IEMIMPL_SSE_EPILOGUE
4293 EPILOGUE_2_ARGS
4294ENDPROC iemAImpl_ %+ %1 %+ _u128
4295%endmacro
4296
; Shuffle / logical.
IEMIMPL_MEDIA_OPT_F2 pshufb, 1
IEMIMPL_MEDIA_OPT_F2 pand, 1
IEMIMPL_MEDIA_OPT_F2 pandn, 1
IEMIMPL_MEDIA_OPT_F2 por, 1
IEMIMPL_MEDIA_OPT_F2 pxor, 1
; Integer compares (the 'q' forms are SSE4.x only, hence no MMX variant).
IEMIMPL_MEDIA_OPT_F2 pcmpeqb, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqw, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqd, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqq, 0
IEMIMPL_MEDIA_OPT_F2 pcmpgtb, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtw, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtd, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtq, 0
; Packed integer addition / subtraction (plain, signed and unsigned saturating).
IEMIMPL_MEDIA_OPT_F2 paddb, 1
IEMIMPL_MEDIA_OPT_F2 paddw, 1
IEMIMPL_MEDIA_OPT_F2 paddd, 1
IEMIMPL_MEDIA_OPT_F2 paddq, 1
IEMIMPL_MEDIA_OPT_F2 paddsb, 1
IEMIMPL_MEDIA_OPT_F2 paddsw, 1
IEMIMPL_MEDIA_OPT_F2 paddusb, 1
IEMIMPL_MEDIA_OPT_F2 paddusw, 1
IEMIMPL_MEDIA_OPT_F2 psubb, 1
IEMIMPL_MEDIA_OPT_F2 psubw, 1
IEMIMPL_MEDIA_OPT_F2 psubd, 1
IEMIMPL_MEDIA_OPT_F2 psubq, 1
IEMIMPL_MEDIA_OPT_F2 psubsb, 1
IEMIMPL_MEDIA_OPT_F2 psubsw, 1
IEMIMPL_MEDIA_OPT_F2 psubusb, 1
IEMIMPL_MEDIA_OPT_F2 psubusw, 1
; Packed multiplies and multiply-accumulate.
IEMIMPL_MEDIA_OPT_F2 pmullw, 1
IEMIMPL_MEDIA_OPT_F2 pmulld, 0
IEMIMPL_MEDIA_OPT_F2 pmulhw, 1
IEMIMPL_MEDIA_OPT_F2 pmaddwd, 1
; Packed min/max.
IEMIMPL_MEDIA_OPT_F2 pminub, 1
IEMIMPL_MEDIA_OPT_F2 pminuw, 0
IEMIMPL_MEDIA_OPT_F2 pminud, 0
IEMIMPL_MEDIA_OPT_F2 pminsb, 0
IEMIMPL_MEDIA_OPT_F2 pminsw, 1
IEMIMPL_MEDIA_OPT_F2 pminsd, 0
IEMIMPL_MEDIA_OPT_F2 pmaxub, 1
IEMIMPL_MEDIA_OPT_F2 pmaxuw, 0
IEMIMPL_MEDIA_OPT_F2 pmaxud, 0
IEMIMPL_MEDIA_OPT_F2 pmaxsb, 0
IEMIMPL_MEDIA_OPT_F2 pmaxsw, 1
IEMIMPL_MEDIA_OPT_F2 pmaxsd, 0
; SSSE3 absolute value / sign application.
IEMIMPL_MEDIA_OPT_F2 pabsb, 1
IEMIMPL_MEDIA_OPT_F2 pabsw, 1
IEMIMPL_MEDIA_OPT_F2 pabsd, 1
IEMIMPL_MEDIA_OPT_F2 psignb, 1
IEMIMPL_MEDIA_OPT_F2 psignw, 1
IEMIMPL_MEDIA_OPT_F2 psignd, 1
; SSSE3 horizontal add/sub.
IEMIMPL_MEDIA_OPT_F2 phaddw, 1
IEMIMPL_MEDIA_OPT_F2 phaddd, 1
IEMIMPL_MEDIA_OPT_F2 phsubw, 1
IEMIMPL_MEDIA_OPT_F2 phsubd, 1
IEMIMPL_MEDIA_OPT_F2 phaddsw, 1
IEMIMPL_MEDIA_OPT_F2 phsubsw, 1
IEMIMPL_MEDIA_OPT_F2 pmaddubsw, 1
IEMIMPL_MEDIA_OPT_F2 pmulhrsw, 1
IEMIMPL_MEDIA_OPT_F2 pmuludq, 1
; Pack with saturation.
IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
; Packed shifts (shift count from the second operand).
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
; Floating point unpacks.
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
; AES-NI and SHA extensions.
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
4389
4390
;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 Pointer to the first full sized media register operand (input/output).
; @param A1 Pointer to the second half sized media register operand (input).
;
; @note Clobbers mm0/mm1 resp. xmm0/xmm1 (volatile in both host ABIs).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; mm0 = destination (also 1st input)
        movq    mm1, [A1]               ; mm1 = 2nd input (only low half used by the insn)
        %1      mm0, mm1
        movq    [A0], mm0               ; write back result

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; xmm0 = destination (also 1st input)
        movdqu  xmm1, [A1]              ; xmm1 = 2nd input
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write back result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
4434
4435
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination register (full sized, output only).
; @param A1 Pointer to the first full sized media source register operand, where we
;           will only use the lower half as input - but we'll be loading it in full.
; @param A2 Pointer to the second full sized media source register operand, where we
;           will only use the lower half as input - but we'll be loading it in full.
;
; @note Clobbers xmm0/xmm1 resp. ymm0/ymm1.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; 1st source
        vmovdqu xmm1, [A2]              ; 2nd source
        %1      xmm0, xmm0, xmm1        ; dst = op(src1, src2)
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; 1st source
        vmovdqu ymm1, [A2]              ; 2nd source
        %1      ymm0, ymm0, ymm1        ; dst = op(src1, src2)
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
4480
4481
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; Expansion-wise this is identical to IEMIMPL_MEDIA_F1L1 (the insn itself picks the
; high half); the alias exists to document intent at the instantiation site.
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 Pointer to the first full sized media register operand (input/output).
; @param A1 Pointer to the second full sized media register operand, where we
;           will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Use the high-half alias here (previously these invoked IEMIMPL_MEDIA_F1L1
; directly, leaving the alias above unused; expansion is identical).
IEMIMPL_MEDIA_F1H1 punpckhbw, 1
IEMIMPL_MEDIA_F1H1 punpckhwd, 1
IEMIMPL_MEDIA_F1H1 punpckhdq, 1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
4500
4501
;;
; Media instruction working two half sized input registers (high half) and a full sized
; destination register (vpunpckh*).
;
; Expansion-wise identical to IEMIMPL_MEDIA_F1L1L1 (the insn itself picks the high
; half); the alias documents intent at the instantiation site.
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination register (full sized, output only).
; @param A1 Pointer to the first full sized media source register operand, where we
;           will only use the upper half as input - but we'll be loading it in full.
; @param A2 Pointer to the second full sized media source register operand, where we
;           will only use the upper half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4522
4523
4524;
4525; Shufflers with evil 8-bit immediates.
4526;
4527
;
; PSHUFW with its 8-bit immediate, implemented via a 256-entry jump table of
; pre-assembled pshufw variants (one per immediate value).
;
; @param A0 Pointer to the destination (output).
; @param A1 Pointer to the source (input).
; @param A2 The 8-bit immediate (only the low byte is used).
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm1, [A1]               ; mm1 = source for the table targets
        movq    mm0, mm1                ; paranoia - pre-seed result like the SSE
                                        ; variants do (was the no-op 'movq mm0, mm0';
                                        ; harmless either way as pshufw fully writes mm0)
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 5 ; dispatch to .imm<A2>, 2^5 byte stride
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:
ENDPROC iemAImpl_pshufw_u64
4550
4551
;;
; SSE pshufhw/pshuflw/pshufd with an 8-bit immediate, via a 256-entry jump table
; of pre-assembled variants (one per immediate value).
;
; @param 1  The instruction.
;
; @param A0 Pointer to the destination (output).
; @param A1 Pointer to the source (input).
; @param A2 The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]              ; xmm1 = source for the table targets
        movdqu  xmm0, xmm1              ; paranoia! (pre-seed result; pshufhw/lw only
                                        ; rewrite half of xmm0, so this matters there)
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6 ; dispatch to .imm<A2>, 2^6 byte stride
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4581
4582
;;
; AVX vpshufhw/vpshuflw/vpshufd (256-bit form) with an 8-bit immediate, via a
; 256-entry jump table of pre-assembled variants.
;
; @param 1  The instruction.
;
; @param A0 Pointer to the destination (output).
; @param A1 Pointer to the source (input).
; @param A2 The 8-bit immediate (only the low byte is used).
;
; NOTE(review): uses the SSE prologue/epilogue macros rather than the AVX ones
; used by e.g. iemAImpl_vpmovmskb_u256 - presumably equivalent here; confirm.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]              ; ymm1 = source for the table targets
        vmovdqu ymm0, ymm1              ; paranoia! (vpshufhw/lw only rewrite half
                                        ; of each lane of ymm0)
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6 ; dispatch to .imm<A2>, 2^6 byte stride
        vmovdqu [A0], ymm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4611
4612
4613;
4614; Shifts with evil 8-bit immediates.
4615;
4616
;;
; MMX shift-by-immediate workers (psllw/pslld/psllq/psrlw/psrld/psrlq/psraw/psrad),
; via a 256-entry jump table of pre-assembled variants, one per immediate value.
;
; @param 1  The instruction.
;
; @param A0 Pointer to the operand to shift (input/output).
; @param A1 The 8-bit shift count immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movq    mm0, [A0]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 5 ; dispatch to .imm<A1>, 2^5 byte stride
        movq    [A0], mm0               ; write back shifted value

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4649
4650
;;
; SSE shift-by-immediate workers (incl. the byte shifts pslldq/psrldq), via a
; 256-entry jump table of pre-assembled variants, one per immediate value.
;
; @param 1  The instruction.
;
; @param A0 Pointer to the operand to shift (input/output).
; @param A1 The 8-bit shift count immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movdqu  xmm0, [A0]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 6 ; dispatch to .imm<A1>, 2^6 byte stride
        movdqu  [A0], xmm0              ; write back shifted value

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4685
4686
4687;
4688; Move byte mask.
4689;
4690
;;
; PMOVMSKB (MMX) - byte sign-bit mask to GPR.
;
; @param A0 Pointer to the 64-bit destination (output; upper 32 bits zeroed).
; @param A1 Pointer to the 64-bit source register value (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        pmovmskb T0, mm1                ; T0 = one bit per source byte (bit 7 of each)
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper dword by hand
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64

;;
; PMOVMSKB (SSE2) - byte sign-bit mask to GPR.
;
; @param A0 Pointer to the 64-bit destination (output; upper 32 bits zeroed).
; @param A1 Pointer to the 128-bit source register value (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1               ; T0 = 16-bit mask, one bit per source byte
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper dword by hand
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128

;;
; VPMOVMSKB (AVX2) - byte sign-bit mask to GPR.
;
; @param A0 Pointer to the 64-bit destination (output; upper 32 bits zeroed).
; @param A1 Pointer to the 256-bit source register value (input).
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1              ; T0 = 32-bit mask, one bit per source byte
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper dword by hand
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4732
4733
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; @param 1 The instruction
;
; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
; @param A1 Pointer to the destination media register size operand (output).
; @param A2 Pointer to the first source media register size operand (input).
; @param A3 Pointer to the second source media register size operand (input).
;
; @todo r=aeichner Not used right now
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]              ; 1st source
        vmovdqu xmm1, [A3]              ; 2nd source
        %1      xmm0, xmm0, xmm1        ; dst = op(src1, src2)
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]              ; 1st source
        vmovdqu ymm1, [A3]              ; 2nd source
        %1      ymm0, ymm0, ymm1        ; dst = op(src1, src2)
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4773
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param 1 The instruction
; @param 2 Flag whether to add a 256-bit variant (1) or not (0).
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; 1st source
        vmovdqu xmm1, [A2]              ; 2nd source
        %1      xmm0, xmm0, xmm1        ; dst = op(src1, src2)
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; 1st source
        vmovdqu ymm1, [A2]              ; 2nd source
        %1      ymm0, ymm0, ymm1        ; dst = op(src1, src2)
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro
4814
;
; Instantiations of IEMIMPL_MEDIA_OPT_F3.  The 2nd parameter selects whether a
; 256-bit/_u256 variant is emitted (0 = 128-bit only, e.g. the VAES workers).
;
IEMIMPL_MEDIA_OPT_F3 vpshufb, 1
IEMIMPL_MEDIA_OPT_F3 vpand, 1
IEMIMPL_MEDIA_OPT_F3 vpminub, 1
IEMIMPL_MEDIA_OPT_F3 vpminuw, 1
IEMIMPL_MEDIA_OPT_F3 vpminud, 1
IEMIMPL_MEDIA_OPT_F3 vpminsb, 1
IEMIMPL_MEDIA_OPT_F3 vpminsw, 1
IEMIMPL_MEDIA_OPT_F3 vpminsd, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxub, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxuw, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxud, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxsb, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxsw, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxsd, 1
IEMIMPL_MEDIA_OPT_F3 vpandn, 1
IEMIMPL_MEDIA_OPT_F3 vpor, 1
IEMIMPL_MEDIA_OPT_F3 vpxor, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqb, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqw, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqd, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqq, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtb, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtw, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtd, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtq, 1
IEMIMPL_MEDIA_OPT_F3 vpaddb, 1
IEMIMPL_MEDIA_OPT_F3 vpaddw, 1
IEMIMPL_MEDIA_OPT_F3 vpaddd, 1
IEMIMPL_MEDIA_OPT_F3 vpaddq, 1
IEMIMPL_MEDIA_OPT_F3 vpsubb, 1
IEMIMPL_MEDIA_OPT_F3 vpsubw, 1
IEMIMPL_MEDIA_OPT_F3 vpsubd, 1
IEMIMPL_MEDIA_OPT_F3 vpsubq, 1
IEMIMPL_MEDIA_OPT_F3 vpacksswb, 1
IEMIMPL_MEDIA_OPT_F3 vpackssdw, 1
IEMIMPL_MEDIA_OPT_F3 vpackuswb, 1
IEMIMPL_MEDIA_OPT_F3 vpackusdw, 1
IEMIMPL_MEDIA_OPT_F3 vpmullw, 1
IEMIMPL_MEDIA_OPT_F3 vpmulld, 1
IEMIMPL_MEDIA_OPT_F3 vpmulhw, 1
IEMIMPL_MEDIA_OPT_F3 vpmulhuw, 1
IEMIMPL_MEDIA_OPT_F3 vpavgb, 1
IEMIMPL_MEDIA_OPT_F3 vpavgw, 1
IEMIMPL_MEDIA_OPT_F3 vpsignb, 1
IEMIMPL_MEDIA_OPT_F3 vpsignw, 1
IEMIMPL_MEDIA_OPT_F3 vpsignd, 1
IEMIMPL_MEDIA_OPT_F3 vphaddw, 1
IEMIMPL_MEDIA_OPT_F3 vphaddd, 1
IEMIMPL_MEDIA_OPT_F3 vphsubw, 1
IEMIMPL_MEDIA_OPT_F3 vphsubd, 1
IEMIMPL_MEDIA_OPT_F3 vphaddsw, 1
IEMIMPL_MEDIA_OPT_F3 vphsubsw, 1
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw, 1
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw, 1
IEMIMPL_MEDIA_OPT_F3 vpsadbw, 1
IEMIMPL_MEDIA_OPT_F3 vpmuldq, 1
IEMIMPL_MEDIA_OPT_F3 vpmuludq, 1
IEMIMPL_MEDIA_OPT_F3 vunpcklps, 1
IEMIMPL_MEDIA_OPT_F3 vunpcklpd, 1
IEMIMPL_MEDIA_OPT_F3 vunpckhps, 1
IEMIMPL_MEDIA_OPT_F3 vunpckhpd, 1
IEMIMPL_MEDIA_OPT_F3 vpsubsb, 1
IEMIMPL_MEDIA_OPT_F3 vpsubsw, 1
IEMIMPL_MEDIA_OPT_F3 vpsubusb, 1
IEMIMPL_MEDIA_OPT_F3 vpsubusw, 1
IEMIMPL_MEDIA_OPT_F3 vpaddusb, 1
IEMIMPL_MEDIA_OPT_F3 vpaddusw, 1
IEMIMPL_MEDIA_OPT_F3 vpaddsb, 1
IEMIMPL_MEDIA_OPT_F3 vpaddsw, 1
IEMIMPL_MEDIA_OPT_F3 vpermilps, 1
IEMIMPL_MEDIA_OPT_F3 vpermilpd, 1
IEMIMPL_MEDIA_OPT_F3 vpmaddwd, 1
IEMIMPL_MEDIA_OPT_F3 vpsrlvd, 1
IEMIMPL_MEDIA_OPT_F3 vpsrlvq, 1
IEMIMPL_MEDIA_OPT_F3 vpsravd, 1
IEMIMPL_MEDIA_OPT_F3 vpsllvd, 1
IEMIMPL_MEDIA_OPT_F3 vpsllvq, 1

; VAES only defines 128-bit forms (AVX512/VAES 256-bit forms not implemented here).
IEMIMPL_MEDIA_OPT_F3 vaesenc, 0
IEMIMPL_MEDIA_OPT_F3 vaesenclast, 0
IEMIMPL_MEDIA_OPT_F3 vaesdec, 0
IEMIMPL_MEDIA_OPT_F3 vaesdeclast, 0
4897
4898
;;
; VAESIMC instruction.
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the second media register size operand (input).
;
; NOTE(review): uses the SSE prologue/epilogue with a VEX-encoded instruction;
; presumably equivalent to the AVX macros here - confirm.
;
BEGINPROC_FASTCALL iemAImpl_vaesimc_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; pre-load dest (vaesimc overwrites it fully)
        movdqu  xmm1, [A1]
        vaesimc xmm0, xmm1              ; inverse MixColumns of the source
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vaesimc_u128
4917
4918
;;
; VAESKEYGENASSIST instruction, via a 256-entry jump table of pre-assembled
; variants (one per 8-bit round-constant immediate).
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the second media register size operand (input).
; @param A2 8-bit immediate for the round constant.
;
BEGINPROC_FASTCALL iemAImpl_vaeskeygenassist_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]              ; pre-load dest (fully overwritten by the insn)
        movdqu  xmm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8 ; dispatch to .imm<A2>, 2^8 byte stride
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vaeskeygenassist xmm0, xmm1, bImm
        ret
        int3                            ; pad/catch fall-through (should be unreachable)
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vaeskeygenassist_u128
4949
4950
;;
; VPERMQ instruction, via a 256-entry jump table of pre-assembled variants
; (one per 8-bit control immediate).
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the second media register size operand (input).
; @param A2 The 8-bit qword-selector immediate.
;
BEGINPROC_FASTCALL iemAImpl_vpermq_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]              ; source; ymm0 is fully written by vpermq
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8 ; dispatch to .imm<A2>, 2^8 byte stride
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpermq  ymm0, ymm1, bImm
        ret
        int3                            ; pad/catch fall-through (should be unreachable)
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpermq_u256
4980
4981
;;
; VPERMPD instruction, via a 256-entry jump table of pre-assembled variants
; (one per 8-bit control immediate).
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the second media register size operand (input).
; @param A2 The 8-bit element-selector immediate.
;
BEGINPROC_FASTCALL iemAImpl_vpermpd_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]              ; source; ymm0 is fully written by vpermpd
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8 ; dispatch to .imm<A2>, 2^8 byte stride
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpermpd ymm0, ymm1, bImm
        ret
        int3                            ; pad/catch fall-through (should be unreachable)
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpermpd_u256
5011
5012
;;
; VPERMPS instruction (register-controlled permute, no immediate).
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the second media register size operand (input, dword indexes).
; @param A2 Pointer to the third media register size operand (input, data).
;
BEGINPROC_FASTCALL iemAImpl_vpermps_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; index operand
        vmovdqu ymm1, [A2]              ; data operand
        vpermps ymm0, ymm0, ymm1        ; dst = permute(data, indexes)
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vpermps_u256
5032
5033
;;
; VPERMD instruction (register-controlled permute, no immediate).
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the second media register size operand (input, dword indexes).
; @param A2 Pointer to the third media register size operand (input, data).
;
BEGINPROC_FASTCALL iemAImpl_vpermd_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; index operand
        vmovdqu ymm1, [A2]              ; data operand
        vpermd  ymm0, ymm0, ymm1        ; dst = permute(data, indexes)
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vpermd_u256
5053
5054
;;
; Media instruction working on one full sized source register, one full sized destination
; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
; of either 16, 32, or 64, it acts like the max shift size)
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_SHIFT_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; value to shift
        vmovdqu xmm1, [A2]              ; shift count (low 64 bits used)
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; value to shift
        vmovdqu xmm1, [A2]              ; count operand is always XMM-sized
        %1      ymm0, ymm0, xmm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_SHIFT_OPT_F3 vpsllw
IEMIMPL_SHIFT_OPT_F3 vpslld
IEMIMPL_SHIFT_OPT_F3 vpsllq
IEMIMPL_SHIFT_OPT_F3 vpsraw
IEMIMPL_SHIFT_OPT_F3 vpsrad
IEMIMPL_SHIFT_OPT_F3 vpsrlw
IEMIMPL_SHIFT_OPT_F3 vpsrld
IEMIMPL_SHIFT_OPT_F3 vpsrlq
5104
5105
;;
; Media instruction working on one full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param 1 The instruction
; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0              ; dst = op(src)
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0              ; dst = op(src)
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
5148
5149
;
; The SSE 4.2 crc32 (CRC-32C accumulation).
;
; @param A0 Pointer to the 32-bit CRC accumulator (input/output).
;           (Header previously said A1/A2; the code uses A0/A1.)
; @param A1 The source operand, sized according to the suffix.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load current CRC
        crc32   T0_32, A1_8             ; fold in one byte
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8

BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load current CRC
        crc32   T0_32, A1_16            ; fold in one word
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16

BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load current CRC
        crc32   T0_32, A1_32            ; fold in one dword
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; zero-extends into T0, as crc32 requires
        crc32   T0, A1                  ; fold in one qword (64-bit host only)
        mov     [A0], T0_32             ; result is always 32 bits

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
5197
5198
;
; PTEST (SSE 4.1) - sets ZF/CF from AND / ANDN of the operands.
;
; @param A0 Pointer to the first source operand (aka readonly destination).
; @param A1 Pointer to the second source operand.
; @param A2 Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ptest   xmm0, xmm1              ; ZF = (src1 & src2)==0, CF = (~src1 & src2)==0
        ; Save ZF+CF; OF/AF/PF/SF are defined to be cleared by ptest.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128

; 256-bit VPTEST variant; same flag semantics.
; NOTE(review): uses the SSE prologue/epilogue with VEX instructions -
; presumably equivalent to the AVX macros here; confirm.
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1              ; ZF = (src1 & src2)==0, CF = (~src1 & src2)==0
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
5231
5232
;; Template for the vtestp{s,d} instructions (sign-bit test, sets ZF/CF).
;
; @param 1 The instruction
;
; @param A0 Pointer to the first source operand (aka readonly destination).
; @param A1 Pointer to the second source operand.
; @param A2 Pointer to the EFLAGS register.
;
%macro IEMIMPL_VTESTP_S_D 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A0]
        vmovdqu xmm1, [A1]
        %1      xmm0, xmm1              ; tests only the element sign bits
        ; Save ZF+CF; OF/AF/PF/SF are defined to be cleared.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        %1      ymm0, ymm1              ; tests only the element sign bits
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_VTESTP_S_D vtestps
IEMIMPL_VTESTP_S_D vtestpd
5271
5272
;;
; Template for the [v]pmov{s,z}x* instructions (sign/zero extending element moves).
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 The source operand value (input); for the _u256 form a pointer to
;           a 128-bit source.
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1                ; NOTE(review): for the q-source forms the
                                        ; assembler widens this via REX.W - confirm
        %1      xmm0, xmm0
        movdqu  [A0], xmm0              ; was vmovdqu: the legacy-SSE worker must not
                                        ; use a VEX encoding (hosts with SSE4.1 but
                                        ; no AVX would #UD)

        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE - epilogue belongs here
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; was legacy movdqu; keep to VEX forms in
                                        ; the AVX worker (avoids SSE/AVX transition)
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
5332
5333
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode, load the value from the given register.
;
; Keeps the host's MXCSR on the stack (first 4-byte slot) for the matching
; SSE_AVX_ST_MXCSR to restore; takes only FZ/RC/DAZ from the guest value and
; masks all exceptions so host code never takes a #XM.
;
; @uses 4 bytes of stack to save the original value (left allocated!), T0.
; @param 1 Expression giving the register holding the guest's MXCSR.
;
%macro SSE_AVX_LD_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save host MXCSR; slot stays allocated for ST
        mov     T0_32, %1
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; guest control bits we honour
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions on the host
        sub     xSP, 4                  ; scratch slot for ldmxcsr
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4                  ; drop scratch slot only
%endmacro
5353
5354
;;
; Restores the SSE MXCSR register with the original value.
;
; Counterpart of SSE_AVX_LD_MXCSR: reads back the post-operation MXCSR, merges
; its exception flags into the guest value, then restores the host MXCSR from
; the stack slot LD left behind and frees that slot.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the register to return the new guest's MXCSR value.
; @param 2 Expression giving the register holding original guest's MXCSR value.
;
; @note Restores the stack pointer (pops the slot SSE_AVX_LD_MXCSR allocated).
;
%macro SSE_AVX_ST_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; read MXCSR after the emulated operation
        mov     %1, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        and     %1, X86_MXCSR_XCPT_FLAGS
        or      %1, %2

        ldmxcsr [xSP]                   ; restore host MXCSR saved by SSE_AVX_LD_MXCSR
        add     xSP, 4                  ; free its slot
%endmacro
5376
5377
;;
; Floating point instruction working on two full sized registers.
;
; @param 1 The instruction
; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
;
; @returns R0_32 The new MXCSR value of the guest.
; @param A0 The guest's MXCSR register value to use.
; @param A1 Where to return the result.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; switch to guest rounding/DAZ/FZ, all exceptions masked

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; merge new exception flags, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE - epilogue belongs here
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1        ; 3-operand AVX form
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1        ; 3-operand AVX form
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1              ; 2-operand AVX form (unary ops)
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1              ; 2-operand AVX form (unary ops)
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs here
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro
5468
; Binary packed FP operations: AVX variants take three operands.
IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.  (AVX variants take two operands, hence flag value 2.)
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 rcpps, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
5502
5503
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; @param 1 The instruction
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0 The guest's MXCSR register value to use.
; @param A1 Where to return the result.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; switch to guest rounding/DAZ/FZ, all exceptions masked

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]              ; 32-bit scalar source
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; merge new exception flags, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]              ; 32-bit scalar source
        v %+ %1 xmm0, xmm0, xmm1        ; 3-operand AVX form
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE - epilogue belongs
                                        ; here (SSE variant above already got it right)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro
5546
5547IEMIMPL_FP_F2_R32 addss
5548IEMIMPL_FP_F2_R32 mulss
5549IEMIMPL_FP_F2_R32 subss
5550IEMIMPL_FP_F2_R32 minss
5551IEMIMPL_FP_F2_R32 divss
5552IEMIMPL_FP_F2_R32 maxss
5553IEMIMPL_FP_F2_R32 cvtss2sd
5554IEMIMPL_FP_F2_R32 sqrtss
5555IEMIMPL_FP_F2_R32 rsqrtss
5556IEMIMPL_FP_F2_R32 rcpss
5557
5558
5559;;
5560; Floating point instruction working on a full sized register and a double precision operand.
5561;
5562; @param 1 The instruction
5563;
5564; @return R0_32 The new MXCSR value of the guest.
5565; @param A0 The guest's MXCSR register value to use.
5566; @param A1 Where to return the result.
5567; @param A2 Pointer to the first media register size operand (input/output).
5568; @param A3 Pointer to the second double precision floating point value (input).
5569;
5570%macro IEMIMPL_FP_F2_R64 1
5571BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
5572 PROLOGUE_4_ARGS
5573 IEMIMPL_SSE_PROLOGUE
5574 SSE_AVX_LD_MXCSR A0_32
5575
5576 movdqu xmm0, [A2]
5577 movq xmm1, [A3]
5578 %1 xmm0, xmm1
5579 movdqu [A1], xmm0
5580
5581 SSE_AVX_ST_MXCSR R0_32, A0_32
5582 IEMIMPL_SSE_EPILOGUE
5583 EPILOGUE_4_ARGS
5584ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
5585
5586BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
5587 PROLOGUE_4_ARGS
5588 IEMIMPL_AVX_PROLOGUE
5589 SSE_AVX_LD_MXCSR A0_32
5590
5591 vmovdqu xmm0, [A2]
5592 vmovq xmm1, [A3]
5593 v %+ %1 xmm0, xmm0, xmm1
5594 vmovdqu [A1], xmm0
5595
5596 SSE_AVX_ST_MXCSR R0_32, A0_32
5597 IEMIMPL_AVX_EPILOGUE
5598 EPILOGUE_4_ARGS
5599ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
5600%endmacro
5601
5602IEMIMPL_FP_F2_R64 addsd
5603IEMIMPL_FP_F2_R64 mulsd
5604IEMIMPL_FP_F2_R64 subsd
5605IEMIMPL_FP_F2_R64 minsd
5606IEMIMPL_FP_F2_R64 divsd
5607IEMIMPL_FP_F2_R64 maxsd
5608IEMIMPL_FP_F2_R64 cvtsd2ss
5609IEMIMPL_FP_F2_R64 sqrtsd
5610
5611
5612;;
5613; Macro for the cvtpd2ps/cvtps2pd instructions.
5614;
5615; 1 The instruction name.
5616; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
5617;
5618; @return R0_32 The new MXCSR value of the guest.
5619; @param A0_32 The guest's MXCSR register value to use.
5620; @param A1 Where to return the result.
5621; @param A2 Pointer to the first media register size operand (input/output).
5622; @param A3 Pointer to the second media register size operand (input).
5623;
5624%macro IEMIMPL_CVT_F2 2
5625BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5626 PROLOGUE_4_ARGS
5627 IEMIMPL_SSE_PROLOGUE
5628 SSE_AVX_LD_MXCSR A0_32
5629
5630 movdqu xmm0, [A2]
5631 movdqu xmm1, [A3]
5632 %1 xmm0, xmm1
5633 movdqu [A1], xmm0
5634
5635 SSE_AVX_ST_MXCSR R0_32, A0_32
5636 IEMIMPL_SSE_EPILOGUE
5637 EPILOGUE_4_ARGS
5638ENDPROC iemAImpl_ %+ %1 %+ _u128
5639
5640BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_u128, 16
5641 PROLOGUE_4_ARGS
5642 IEMIMPL_AVX_PROLOGUE
5643 SSE_AVX_LD_MXCSR A0_32
5644
5645 vmovdqu xmm1, [A2]
5646 v %+ %1 xmm0, xmm1
5647 vmovdqu [A1], xmm0
5648
5649 SSE_AVX_ST_MXCSR R0_32, A0_32
5650 IEMIMPL_AVX_EPILOGUE
5651 EPILOGUE_4_ARGS
5652ENDPROC iemAImpl_v %+ %1 %+ _u128_u128
5653
5654BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_u256, 16
5655 PROLOGUE_4_ARGS
5656 IEMIMPL_AVX_PROLOGUE
5657 SSE_AVX_LD_MXCSR A0_32
5658
5659 vmovdqu xmm1, [A2]
5660 %if %2 == 0
5661 v %+ %1 xmm0, xmm1
5662 %else
5663 v %+ %1 ymm0, xmm1
5664 %endif
5665 vmovdqu [A1], ymm0
5666
5667 SSE_AVX_ST_MXCSR R0_32, A0_32
5668 IEMIMPL_AVX_EPILOGUE
5669 EPILOGUE_4_ARGS
5670ENDPROC iemAImpl_v %+ %1 %+ _u128_u256
5671%endmacro
5672
5673IEMIMPL_CVT_F2 cvtpd2ps, 0
5674IEMIMPL_CVT_F2 cvttpd2dq, 0
5675IEMIMPL_CVT_F2 cvtpd2dq, 0
5676
5677;IEMIMPL_CVT_F2 cvtps2pd, 1 - inefficient.
5678
;;
; cvtps2pd instruction - SSE variant.
;
; Converts the two low packed singles at [A2] to a pair of doubles; the memory
; operand form is used directly since only 64 bits of source are consumed.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtps2pd_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtps2pd xmm0, [A2]
        movdqu   [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtps2pd_u128
5691
5692
5693;;
5694; vcvtps2pd instruction - 128-bit variant.
5695;
5696; @return R0_32 The new MXCSR value of the guest.
5697; @param A0_32 The guest's MXCSR register value to use.
5698; @param A1 Pointer to the result operand (output).
5699; @param A2 Pointer to the second operand (input).
5700;
5701BEGINPROC_FASTCALL iemAImpl_vcvtps2pd_u128_u64, 16
5702 PROLOGUE_3_ARGS
5703 IEMIMPL_AVX_PROLOGUE
5704 SSE_AVX_LD_MXCSR A0_32
5705
5706 vcvtps2pd xmm0, qword [A2]
5707 movdqu [A1], xmm0
5708
5709 SSE_AVX_ST_MXCSR R0_32, A0_32
5710 IEMIMPL_AVX_EPILOGUE
5711 EPILOGUE_3_ARGS
5712ENDPROC iemAImpl_vcvtps2pd_u128_u64
5713
5714
5715;;
5716; vcvtps2pd instruction - 256-bit variant.
5717;
5718; @return R0_32 The new MXCSR value of the guest.
5719; @param A0_32 The guest's MXCSR register value to use.
5720; @param A1 Pointer to the result operand (output).
5721; @param A2 Pointer to the second operand (input).
5722;
5723BEGINPROC_FASTCALL iemAImpl_vcvtps2pd_u256_u128, 16
5724 PROLOGUE_3_ARGS
5725 IEMIMPL_AVX_PROLOGUE
5726 SSE_AVX_LD_MXCSR A0_32
5727
5728 movdqu xmm0, [A2]
5729 vcvtps2pd ymm0, xmm1
5730 vmovdqu [A1], ymm0
5731
5732 SSE_AVX_ST_MXCSR R0_32, A0_32
5733 IEMIMPL_AVX_EPILOGUE
5734 EPILOGUE_3_ARGS
5735ENDPROC iemAImpl_vcvtps2pd_u256_u128
5736
5737
5738;;
5739; shufps instructions with 8-bit immediates.
5740;
5741; @param A0 Pointer to the destination media register size operand (input/output).
5742; @param A1 Pointer to the first source media register size operand (input).
5743; @param A2 The 8-bit immediate
5744;
5745BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
5746 PROLOGUE_3_ARGS
5747 IEMIMPL_SSE_PROLOGUE
5748
5749 movzx A2, A2_8 ; must clear top bits
5750 movdqu xmm0, [A0]
5751 movdqu xmm1, [A1]
5752 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5753 movdqu [A0], xmm0
5754
5755 IEMIMPL_SSE_EPILOGUE
5756 EPILOGUE_3_ARGS
5757 %assign bImm 0
5758 %rep 256
5759.imm %+ bImm:
5760 IBT_ENDBRxx_WITHOUT_NOTRACK
5761 shufps xmm0, xmm1, bImm
5762 ret
5763 int3
5764 %assign bImm bImm + 1
5765 %endrep
5766.immEnd:
5767ENDPROC iemAImpl_shufps_u128
5768
5769
5770;;
5771; shufpd instruction with 8-bit immediates.
5772;
5773; @param A0 Pointer to the destination media register size operand (input/output).
5774; @param A1 Pointer to the first source media register size operand (input).
5775; @param A2 The 8-bit immediate
5776;
5777BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
5778 PROLOGUE_3_ARGS
5779 IEMIMPL_SSE_PROLOGUE
5780
5781 movzx A2, A2_8 ; must clear top bits
5782 movdqu xmm0, [A0]
5783 movdqu xmm1, [A1]
5784 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5785 movdqu [A0], xmm0
5786
5787 IEMIMPL_SSE_EPILOGUE
5788 EPILOGUE_3_ARGS
5789 %assign bImm 0
5790 %rep 256
5791.imm %+ bImm:
5792 IBT_ENDBRxx_WITHOUT_NOTRACK
5793 shufpd xmm0, xmm1, bImm
5794 ret
5795 %assign bImm bImm + 1
5796 %endrep
5797.immEnd:
5798ENDPROC iemAImpl_shufpd_u128
5799
5800
5801;;
5802; vshufp{s,d} instructions with 8-bit immediates.
5803;
5804; @param 1 The instruction name.
5805;
5806; @param A0 Pointer to the destination media register size operand (output).
5807; @param A1 Pointer to the first source media register size operand (input).
5808; @param A2 Pointer to the second source media register size operand (input).
5809; @param A3 The 8-bit immediate
5810;
5811%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
5812BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5813 PROLOGUE_4_ARGS
5814 IEMIMPL_AVX_PROLOGUE
5815
5816 movzx A3, A3_8 ; must clear top bits
5817 movdqu xmm0, [A1]
5818 movdqu xmm1, [A2]
5819 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
5820 movdqu [A0], xmm0
5821
5822 IEMIMPL_AVX_EPILOGUE
5823 EPILOGUE_4_ARGS
5824 %assign bImm 0
5825 %rep 256
5826.imm %+ bImm:
5827 IBT_ENDBRxx_WITHOUT_NOTRACK
5828 %1 xmm0, xmm0, xmm1, bImm
5829 ret
5830 %assign bImm bImm + 1
5831 %endrep
5832.immEnd:
5833ENDPROC iemAImpl_ %+ %1 %+ _u128
5834
5835BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5836 PROLOGUE_4_ARGS
5837 IEMIMPL_AVX_PROLOGUE
5838
5839 movzx A3, A3_8 ; must clear top bits
5840 vmovdqu ymm0, [A1]
5841 vmovdqu ymm1, [A2]
5842 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
5843 vmovdqu [A0], ymm0
5844
5845 IEMIMPL_AVX_EPILOGUE
5846 EPILOGUE_4_ARGS
5847 %assign bImm 0
5848 %rep 256
5849.imm %+ bImm:
5850 IBT_ENDBRxx_WITHOUT_NOTRACK
5851 %1 ymm0, ymm0, ymm1, bImm
5852 ret
5853 %assign bImm bImm + 1
5854 %endrep
5855.immEnd:
5856ENDPROC iemAImpl_ %+ %1 %+ _u256
5857%endmacro
5858
5859IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
5860IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5861
5862
5863;;
5864; One of the [p]blendv{b,ps,pd} variants
5865;
5866; @param 1 The instruction
5867;
5868; @param A0 Pointer to the first media register sized operand (input/output).
5869; @param A1 Pointer to the second media sized value (input).
5870; @param A2 Pointer to the media register sized mask value (input).
5871;
5872%macro IEMIMPL_P_BLEND 1
5873BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5874 PROLOGUE_3_ARGS
5875 IEMIMPL_SSE_PROLOGUE
5876
5877 movdqu xmm0, [A2] ; This is implicit
5878 movdqu xmm1, [A0]
5879 movdqu xmm2, [A1] ; @todo Do I need to save the original value here first?
5880 %1 xmm1, xmm2
5881 movdqu [A0], xmm1
5882
5883 IEMIMPL_SSE_PROLOGUE
5884 EPILOGUE_3_ARGS
5885ENDPROC iemAImpl_ %+ %1 %+ _u128
5886%endmacro
5887
5888IEMIMPL_P_BLEND pblendvb
5889IEMIMPL_P_BLEND blendvps
5890IEMIMPL_P_BLEND blendvpd
5891
5892
5893;;
5894; One of the v[p]blendv{b,ps,pd} variants
5895;
5896; @param 1 The instruction
5897;
5898; @param A0 Pointer to the first media register sized operand (output).
5899; @param A1 Pointer to the first media register sized operand (input).
5900; @param A2 Pointer to the second media register sized operand (input).
5901; @param A3 Pointer to the media register sized mask value (input).
5902%macro IEMIMPL_AVX_P_BLEND 1
5903BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5904 PROLOGUE_4_ARGS
5905 IEMIMPL_AVX_PROLOGUE
5906
5907 vmovdqu xmm0, [A1]
5908 vmovdqu xmm1, [A2]
5909 vmovdqu xmm2, [A3]
5910 %1 xmm0, xmm0, xmm1, xmm2
5911 vmovdqu [A0], xmm0
5912
5913 IEMIMPL_AVX_PROLOGUE
5914 EPILOGUE_4_ARGS
5915ENDPROC iemAImpl_ %+ %1 %+ _u128
5916
5917BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5918 PROLOGUE_4_ARGS
5919 IEMIMPL_AVX_PROLOGUE
5920
5921 vmovdqu ymm0, [A1]
5922 vmovdqu ymm1, [A2]
5923 vmovdqu ymm2, [A3]
5924 %1 ymm0, ymm0, ymm1, ymm2
5925 vmovdqu [A0], ymm0
5926
5927 IEMIMPL_AVX_PROLOGUE
5928 EPILOGUE_4_ARGS
5929ENDPROC iemAImpl_ %+ %1 %+ _u256
5930%endmacro
5931
5932IEMIMPL_AVX_P_BLEND vpblendvb
5933IEMIMPL_AVX_P_BLEND vblendvps
5934IEMIMPL_AVX_P_BLEND vblendvpd
5935
5936
5937;;
5938; palignr mm1, mm2/m64 instruction.
5939;
5940; @param A0 Pointer to the first media register sized operand (output).
5941; @param A1 The second register sized operand (input).
5942; @param A2 The 8-bit immediate.
5943BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
5944 PROLOGUE_3_ARGS
5945 IEMIMPL_MMX_PROLOGUE
5946
5947 movzx A2, A2_8 ; must clear top bits
5948 movq mm0, [A0]
5949 movq mm1, A1
5950 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5951 movq [A0], mm0
5952
5953 IEMIMPL_MMX_EPILOGUE
5954 EPILOGUE_3_ARGS
5955 %assign bImm 0
5956 %rep 256
5957.imm %+ bImm:
5958 IBT_ENDBRxx_WITHOUT_NOTRACK
5959 palignr mm0, mm1, bImm
5960 ret
5961 %assign bImm bImm + 1
5962 %endrep
5963.immEnd:
5964ENDPROC iemAImpl_palignr_u64
5965
5966
5967;;
5968; SSE instructions with 8-bit immediates of the form
5969; xxx xmm1, xmm2, imm8.
5970; where the instruction encoding takes up 6 bytes.
5971;
5972; @param 1 The instruction name.
5973;
5974; @param A0 Pointer to the first media register size operand (input/output).
5975; @param A1 Pointer to the second source media register size operand (input).
5976; @param A2 The 8-bit immediate
5977;
5978%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
5979BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5980 PROLOGUE_3_ARGS
5981 IEMIMPL_SSE_PROLOGUE
5982
5983 movzx A2, A2_8 ; must clear top bits
5984 movdqu xmm0, [A0]
5985 movdqu xmm1, [A1]
5986 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
5987 movdqu [A0], xmm0
5988
5989 IEMIMPL_SSE_EPILOGUE
5990 EPILOGUE_3_ARGS
5991 %assign bImm 0
5992 %rep 256
5993.imm %+ bImm:
5994 IBT_ENDBRxx_WITHOUT_NOTRACK
5995 %1 xmm0, xmm1, bImm
5996 ret
5997 int3
5998 %assign bImm bImm + 1
5999 %endrep
6000.immEnd:
6001ENDPROC iemAImpl_ %+ %1 %+ _u128
6002%endmacro
6003
6004IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
6005IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
6006IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
6007IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
6008IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
6009IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
6010IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
6011
6012
6013;;
6014; AVX instructions with 8-bit immediates of the form
6015; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
6016; where the instruction encoding takes up 6 bytes.
6017;
6018; @param 1 The instruction name.
6019; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
6020; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
6021;
6022; @param A0 Pointer to the destination media register size operand (output).
6023; @param A1 Pointer to the first source media register size operand (input).
6024; @param A2 Pointer to the second source media register size operand (input).
6025; @param A3 The 8-bit immediate
6026;
6027%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
6028 %if %2 == 1
6029BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6030 PROLOGUE_4_ARGS
6031 IEMIMPL_AVX_PROLOGUE
6032
6033 movzx A3, A3_8 ; must clear top bits
6034 movdqu xmm0, [A1]
6035 movdqu xmm1, [A2]
6036 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
6037 movdqu [A0], xmm0
6038
6039 IEMIMPL_AVX_EPILOGUE
6040 EPILOGUE_4_ARGS
6041 %assign bImm 0
6042 %rep 256
6043.imm %+ bImm:
6044 IBT_ENDBRxx_WITHOUT_NOTRACK
6045 %1 xmm0, xmm0, xmm1, bImm
6046 ret
6047 int3
6048 %assign bImm bImm + 1
6049 %endrep
6050.immEnd:
6051ENDPROC iemAImpl_ %+ %1 %+ _u128
6052 %endif
6053
6054 %if %3 == 1
6055BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
6056 PROLOGUE_4_ARGS
6057 IEMIMPL_AVX_PROLOGUE
6058
6059 movzx A3, A3_8 ; must clear top bits
6060 vmovdqu ymm0, [A1]
6061 vmovdqu ymm1, [A2]
6062 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
6063 vmovdqu [A0], ymm0
6064
6065 IEMIMPL_AVX_EPILOGUE
6066 EPILOGUE_4_ARGS
6067 %assign bImm 0
6068 %rep 256
6069.imm %+ bImm:
6070 IBT_ENDBRxx_WITHOUT_NOTRACK
6071 %1 ymm0, ymm0, ymm1, bImm
6072 ret
6073 int3
6074 %assign bImm bImm + 1
6075 %endrep
6076.immEnd:
6077ENDPROC iemAImpl_ %+ %1 %+ _u256
6078 %endif
6079%endmacro
6080
6081IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
6082IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
6083IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
6084IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd, 1, 1
6085IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
6086IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
6087IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
6088IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
6089IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
6090
6091
6092;;
6093; AVX instructions with 8-bit immediates of the form
6094; xxx {x,y}mm1, {x,y}mm2, imm8.
6095; where the instruction encoding takes up 6 bytes.
6096;
6097; @param 1 The instruction name.
6098; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
6099; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
6100; @param 4 The number of bytes taken up by a single instance of the instruction.
6101;
6102; @param A0 Pointer to the destination media register size operand (output).
6103; @param A1 Pointer to the first source media register size operand (input).
6104; @param A2 The 8-bit immediate
6105;
6106%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP 4
6107 %if %2 == 1
6108BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
6109 PROLOGUE_4_ARGS
6110 IEMIMPL_AVX_PROLOGUE
6111
6112 movzx A2, A2_8 ; must clear top bits
6113 movdqu xmm1, [A1]
6114 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
6115 movdqu [A0], xmm0
6116
6117 IEMIMPL_AVX_EPILOGUE
6118 EPILOGUE_4_ARGS
6119 %assign bImm 0
6120 %rep 256
6121.imm %+ bImm:
6122 IBT_ENDBRxx_WITHOUT_NOTRACK
6123 %1 xmm0, xmm1, bImm
6124 ret
6125 int3
6126 %assign bImm bImm + 1
6127 %endrep
6128.immEnd:
6129ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
6130 %endif
6131
6132 %if %3 == 1
6133BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
6134 PROLOGUE_4_ARGS
6135 IEMIMPL_AVX_PROLOGUE
6136
6137 movzx A2, A2_8 ; must clear top bits
6138 vmovdqu ymm1, [A1]
6139 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
6140 vmovdqu [A0], ymm0
6141
6142 IEMIMPL_AVX_EPILOGUE
6143 EPILOGUE_4_ARGS
6144 %assign bImm 0
6145 %rep 256
6146.imm %+ bImm:
6147 IBT_ENDBRxx_WITHOUT_NOTRACK
6148 %1 ymm0, ymm1, bImm
6149 ret
6150 int3
6151 %assign bImm bImm + 1
6152 %endrep
6153.immEnd:
6154ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
6155 %endif
6156%endmacro
6157
6158IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilps, 1, 1, 8
6159IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilpd, 1, 1, 8
6160IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpslldq, 1, 1, 7
6161IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpsrldq, 1, 1, 7
6162
6163
6164;;
6165; Need to move this as well somewhere better?
6166;
6167struc IEMPCMPISTRXSRC
6168 .uSrc1 resd 4
6169 .uSrc2 resd 4
6170endstruc
6171
6172struc IEMPCMPESTRXSRC
6173 .uSrc1 resd 4
6174 .uSrc2 resd 4
6175 .u64Rax resd 2
6176 .u64Rdx resd 2
6177endstruc
6178
6179;;
6180; The pcmpistri/vcmpistri instruction.
6181;
6182; @param 1 The instruction name
6183;
6184; @return R0_32 The new ECX value.
6185; @param A0 Pointer to the EFLAGS register.
6186; @param A1 Pointer to the first operand (input).
6187; @param A2 Pointer to the second operand (input).
6188; @param A3 The 8-bit immediate
6189;
6190%macro IEMIMPL_MEDIA_V_CMPISTRI 1
6191BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6192 PROLOGUE_4_ARGS
6193 IEMIMPL_SSE_PROLOGUE
6194
6195 movzx A3, A3_8 ; must clear top bits
6196 movdqu xmm0, [A1]
6197 movdqu xmm1, [A2]
6198 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
6199 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
6200
6201 IEM_SAVE_FLAGS_OLD T2, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6202 mov R0_32, ecx
6203
6204 IEMIMPL_SSE_EPILOGUE
6205 EPILOGUE_4_ARGS
6206 %assign bImm 0
6207 %rep 256
6208.imm %+ bImm:
6209 IBT_ENDBRxx_WITHOUT_NOTRACK
6210 %1 xmm0, xmm1, bImm
6211 ret
6212 int3
6213 %assign bImm bImm + 1
6214 %endrep
6215.immEnd:
6216ENDPROC iemAImpl_ %+ %1 %+ _u128
6217%endmacro
6218
6219IEMIMPL_MEDIA_V_CMPISTRI pcmpistri
6220IEMIMPL_MEDIA_V_CMPISTRI vpcmpistri
6221
6222
6223;;
6224; The pcmpestri instruction.
6225;
6226; @param 1 The instruction name
6227;
6228; @param A0 Pointer to the ECX register to store the result to (output).
6229; @param A1 Pointer to the EFLAGS register.
6230; @param A2 Pointer to the structure containing the source operands (input).
6231; @param A3 The 8-bit immediate
6232;
6233BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
6234 PROLOGUE_4_ARGS
6235 IEMIMPL_SSE_PROLOGUE
6236
6237 movzx A3, A3_8 ; must clear top bits
6238 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
6239 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
6240 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
6241 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6242 push xDX ; xDX can be A1 or A2 depending on the calling convention
6243 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6244 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6245 IBT_NOTRACK
6246 call T1
6247
6248 pop xDX
6249 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6250 mov [T2], ecx
6251
6252 IEMIMPL_SSE_EPILOGUE
6253 EPILOGUE_4_ARGS
6254 %assign bImm 0
6255 %rep 256
6256.imm %+ bImm:
6257 IBT_ENDBRxx_WITHOUT_NOTRACK
6258 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
6259 pcmpestri xmm0, xmm1, bImm
6260 ret
6261 %assign bImm bImm + 1
6262 %endrep
6263.immEnd:
6264ENDPROC iemAImpl_pcmpestri_u128
6265
6266
6267;;
6268; The vpcmpestri instruction.
6269;
6270; @param 1 The instruction name
6271;
6272; @param A0 Pointer to the ECX register to store the result to (output).
6273; @param A1 Pointer to the EFLAGS register.
6274; @param A2 Pointer to the structure containing the source operands (input).
6275; @param A3 The 8-bit immediate
6276;
6277BEGINPROC_FASTCALL iemAImpl_vpcmpestri_u128, 16
6278 PROLOGUE_4_ARGS
6279 IEMIMPL_SSE_PROLOGUE
6280
6281 movzx A3, A3_8 ; must clear top bits
6282 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
6283 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
6284 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
6285 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6286 push xDX ; xDX can be A1 or A2 depending on the calling convention
6287 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6288 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6289 IBT_NOTRACK
6290 call T1
6291
6292 pop xDX
6293 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6294 mov [T2], ecx
6295
6296 IEMIMPL_SSE_EPILOGUE
6297 EPILOGUE_4_ARGS
6298 %assign bImm 0
6299 %rep 256
6300.imm %+ bImm:
6301 IBT_ENDBRxx_WITHOUT_NOTRACK
6302 db 0xc4, 0xe3, 0xf9, 0x61, 0xc1, bImm ; vpcmpestri xmm0,xmm1,0x1 with VEX.W set
6303 ret
6304 int3
6305 %assign bImm bImm + 1
6306 %endrep
6307.immEnd:
6308ENDPROC iemAImpl_vpcmpestri_u128
6309
6310
6311;;
6312; The pcmpistrm/vpcmpistrm instruction template.
6313;
6314; @param 1 The instruction name
6315;
6316; @param A0 Pointer to the XMM0 register to store the result to (output).
6317; @param A1 Pointer to the EFLAGS register.
6318; @param A2 Pointer to the structure containing the source operands (input).
6319; @param A3 The 8-bit immediate
6320;
6321%macro IEMIMPL_MEDIA_V_CMPISTRM 1
6322BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6323 PROLOGUE_4_ARGS
6324 IEMIMPL_SSE_PROLOGUE
6325
6326 movzx A3, A3_8 ; must clear top bits
6327 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
6328 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
6329 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
6330
6331 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6332 movdqu [A0], xmm0
6333
6334 IEMIMPL_SSE_EPILOGUE
6335 EPILOGUE_4_ARGS
6336 %assign bImm 0
6337 %rep 256
6338.imm %+ bImm:
6339 IBT_ENDBRxx_WITHOUT_NOTRACK
6340 %1 xmm1, xmm2, bImm
6341 ret
6342 int3
6343 %assign bImm bImm + 1
6344 %endrep
6345.immEnd:
6346ENDPROC iemAImpl_ %+ %1 %+ _u128
6347%endmacro
6348
6349IEMIMPL_MEDIA_V_CMPISTRM pcmpistrm
6350IEMIMPL_MEDIA_V_CMPISTRM vpcmpistrm
6351
6352
6353;;
6354; The pcmpestrm instruction.
6355;
6356; @param A0 Pointer to the XMM0 register to store the result to (output).
6357; @param A1 Pointer to the EFLAGS register.
6358; @param A2 Pointer to the structure containing the source operands (input).
6359; @param A3 The 8-bit immediate
6360;
6361BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
6362 PROLOGUE_4_ARGS
6363 IEMIMPL_SSE_PROLOGUE
6364
6365 movzx A3, A3_8 ; must clear top bits
6366 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
6367 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
6368 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6369 push xDX ; xDX can be A1 or A2 depending on the calling convention
6370 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6371 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6372 IBT_NOTRACK
6373 call T1
6374
6375 pop xDX
6376 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6377 movdqu [A0], xmm0
6378
6379 IEMIMPL_SSE_EPILOGUE
6380 EPILOGUE_4_ARGS
6381 %assign bImm 0
6382 %rep 256
6383.imm %+ bImm:
6384 IBT_ENDBRxx_WITHOUT_NOTRACK
6385 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
6386 pcmpestrm xmm1, xmm2, bImm
6387 ret
6388 %assign bImm bImm + 1
6389 %endrep
6390.immEnd:
6391ENDPROC iemAImpl_pcmpestrm_u128
6392
6393
6394;;
6395; The vpcmpestrm instruction.
6396;
6397; @param A0 Pointer to the XMM0 register to store the result to (output).
6398; @param A1 Pointer to the EFLAGS register.
6399; @param A2 Pointer to the structure containing the source operands (input).
6400; @param A3 The 8-bit immediate
6401;
6402BEGINPROC_FASTCALL iemAImpl_vpcmpestrm_u128, 16
6403 PROLOGUE_4_ARGS
6404 IEMIMPL_SSE_PROLOGUE
6405
6406 movzx A3, A3_8 ; must clear top bits
6407 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
6408 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
6409 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6410 push xDX ; xDX can be A1 or A2 depending on the calling convention
6411 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6412 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6413 IBT_NOTRACK
6414 call T1
6415
6416 pop xDX
6417 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6418 movdqu [A0], xmm0
6419
6420 IEMIMPL_SSE_EPILOGUE
6421 EPILOGUE_4_ARGS
6422 %assign bImm 0
6423 %rep 256
6424.imm %+ bImm:
6425 IBT_ENDBRxx_WITHOUT_NOTRACK
6426 db 0xc4, 0xe3, 0xf9, 0x60, 0xca, bImm ; vpcmpestrm xmm1, xmm2, bImm with VEX.W set
6427 ret
6428 int3
6429 %assign bImm bImm + 1
6430 %endrep
6431.immEnd:
6432ENDPROC iemAImpl_vpcmpestrm_u128
6433
6434
6435;;
6436; movmskp{s,d} SSE instruction template
6437;
6438; @param 1 The SSE instruction name.
6439; @param 2 The AVX instruction name.
6440;
6441; @param A0 Pointer to the output register (output/byte sized).
6442; @param A1 Pointer to the source media register size operand (input).
6443;
6444%macro IEMIMPL_MEDIA_MOVMSK_P 2
6445BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6446 PROLOGUE_2_ARGS
6447 IEMIMPL_SSE_PROLOGUE
6448
6449 movdqu xmm0, [A1]
6450 %1 T0, xmm0
6451 mov byte [A0], T0_8
6452
6453 IEMIMPL_SSE_EPILOGUE
6454 EPILOGUE_2_ARGS
6455ENDPROC iemAImpl_ %+ %1 %+ _u128
6456
6457BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
6458 PROLOGUE_2_ARGS
6459 IEMIMPL_AVX_PROLOGUE
6460
6461 movdqu xmm0, [A1]
6462 %2 T0, xmm0
6463 mov byte [A0], T0_8
6464
6465 IEMIMPL_AVX_EPILOGUE
6466 EPILOGUE_2_ARGS
6467ENDPROC iemAImpl_ %+ %2 %+ _u128
6468
6469BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
6470 PROLOGUE_2_ARGS
6471 IEMIMPL_AVX_PROLOGUE
6472
6473 vmovdqu ymm0, [A1]
6474 %2 T0, ymm0
6475 mov byte [A0], T0_8
6476
6477 IEMIMPL_AVX_EPILOGUE
6478 EPILOGUE_2_ARGS
6479ENDPROC iemAImpl_ %+ %2 %+ _u256
6480%endmacro
6481
6482IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
6483IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6484
6485
6486;;
6487; Template for [v]cvttss2si/[v]cvtss2si instructions.
6488;
6489; @param 1 Instruction name.
6490; @param 2 AVX or SSE
6491;
6492; @return R0_32 The new MXCSR value of the guest.
6493; @param A0_32 The guest's MXCSR register value to use.
6494; @param A1 Pointer to the result operand (output).
6495; @param A2 Pointer to the second operand (input).
6496;
6497%macro IEMIMPL_MEDIA_V_CVTXSS2SI 2
6498BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i32_r32, 16
6499 PROLOGUE_3_ARGS
6500 IEMIMPL_ %+ %2 %+ _PROLOGUE
6501 SSE_AVX_LD_MXCSR A0_32
6502
6503 %1 T0_32, [A2]
6504 mov dword [A1], T0_32
6505
6506 SSE_AVX_ST_MXCSR R0_32, A0_32
6507 IEMIMPL_ %+ %2 %+ _EPILOGUE
6508 EPILOGUE_3_ARGS
6509ENDPROC iemAImpl_ %+ %1 %+ _i32_r32
6510
6511
6512BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i64_r32, 16
6513 PROLOGUE_3_ARGS
6514 IEMIMPL_ %+ %2 %+ _PROLOGUE
6515 SSE_AVX_LD_MXCSR A0_32
6516
6517 %1 T0, [A2]
6518 mov qword [A1], T0
6519
6520 SSE_AVX_ST_MXCSR R0_32, A0_32
6521 IEMIMPL_ %+ %2 %+ _EPILOGUE
6522 EPILOGUE_3_ARGS
6523ENDPROC iemAImpl_ %+ %1 %+ _i64_r32
6524%endmacro
6525
6526IEMIMPL_MEDIA_V_CVTXSS2SI cvttss2si, SSE
6527IEMIMPL_MEDIA_V_CVTXSS2SI vcvttss2si, AVX
6528IEMIMPL_MEDIA_V_CVTXSS2SI cvtss2si, SSE
6529IEMIMPL_MEDIA_V_CVTXSS2SI vcvtss2si, AVX
6530
6531
6532;;
6533; Template for [v]cvttsd2si/[v]cvtsd2si instructions.
6534;
6535; @param 1 Instruction name.
6536; @param 2 AVX or SSE
6537;
6538; @return R0_32 The new MXCSR value of the guest.
6539; @param A0_32 The guest's MXCSR register value to use.
6540; @param A1 Pointer to the result operand (output).
6541; @param A2 Pointer to the second operand (input).
6542;
6543%macro IEMIMPL_MEDIA_V_CVTXSD2SI 2
6544BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i32_r64, 16
6545 PROLOGUE_3_ARGS
6546 IEMIMPL_ %+ %2 %+ _PROLOGUE
6547 SSE_AVX_LD_MXCSR A0_32
6548
6549 %1 T0_32, [A2]
6550 mov dword [A1], T0_32
6551
6552 SSE_AVX_ST_MXCSR R0_32, A0_32
6553 IEMIMPL_ %+ %2 %+ _EPILOGUE
6554 EPILOGUE_3_ARGS
6555ENDPROC iemAImpl_ %+ %1 %+ _i32_r64
6556
6557
6558BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i64_r64, 16
6559 PROLOGUE_3_ARGS
6560 IEMIMPL_ %+ %2 %+ _PROLOGUE
6561 SSE_AVX_LD_MXCSR A0_32
6562
6563 %1 T0, [A2]
6564 mov qword [A1], T0
6565
6566 SSE_AVX_ST_MXCSR R0_32, A0_32
6567 IEMIMPL_ %+ %2 %+ _EPILOGUE
6568 EPILOGUE_3_ARGS
6569ENDPROC iemAImpl_ %+ %1 %+ _i64_r64
6570%endmacro
6571
6572IEMIMPL_MEDIA_V_CVTXSD2SI cvttsd2si, SSE
6573IEMIMPL_MEDIA_V_CVTXSD2SI vcvttsd2si, AVX
6574IEMIMPL_MEDIA_V_CVTXSD2SI cvtsd2si, SSE
6575IEMIMPL_MEDIA_V_CVTXSD2SI vcvtsd2si, AVX
6576
6577
6578;;
6579; cvtsi2ss instruction - 32-bit variant.
6580;
6581; @return R0_32 The new MXCSR value of the guest.
6582; @param A0_32 The guest's MXCSR register value to use.
6583; @param A1 Pointer to the result operand (output).
6584; @param A2 Pointer to the second operand (input).
6585;
6586BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
6587 PROLOGUE_3_ARGS
6588 IEMIMPL_SSE_PROLOGUE
6589 SSE_AVX_LD_MXCSR A0_32
6590
6591 cvtsi2ss xmm0, dword [A2]
6592 movd dword [A1], xmm0
6593
6594 SSE_AVX_ST_MXCSR R0_32, A0_32
6595 IEMIMPL_SSE_EPILOGUE
6596 EPILOGUE_3_ARGS
6597ENDPROC iemAImpl_cvtsi2ss_r32_i32
6598
6599
6600;;
6601; vcvtsi2ss instruction - 32-bit variant.
6602;
6603; @return R0_32 The new MXCSR value of the guest.
6604; @param A0_32 The guest's MXCSR register value to use.
6605; @param A1 Pointer to the result operand (output).
6606; @param A2 Pointer to the second operand (input).
6607; @param A3 Pointer to the third operand (input).
6608;
6609BEGINPROC_FASTCALL iemAImpl_vcvtsi2ss_u128_i32, 16
6610 PROLOGUE_3_ARGS
6611 IEMIMPL_AVX_PROLOGUE
6612 SSE_AVX_LD_MXCSR A0_32
6613
6614 movdqu xmm0, [A2]
6615 vcvtsi2ss xmm0, xmm0, dword [A3]
6616 movdqu [A1], xmm0
6617
6618 SSE_AVX_ST_MXCSR R0_32, A0_32
6619 IEMIMPL_AVX_EPILOGUE
6620 EPILOGUE_3_ARGS
6621ENDPROC iemAImpl_vcvtsi2ss_u128_i32
6622
6623
6624;;
6625; cvtsi2ss instruction - 64-bit variant.
6626;
6627; @return R0_32 The new MXCSR value of the guest.
6628; @param A0_32 The guest's MXCSR register value to use.
6629; @param A1 Pointer to the result operand (output).
6630; @param A2 Pointer to the second operand (input).
6631;
6632BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
6633 PROLOGUE_3_ARGS
6634 IEMIMPL_SSE_PROLOGUE
6635 SSE_AVX_LD_MXCSR A0_32
6636
6637 cvtsi2ss xmm0, qword [A2]
6638 movd dword [A1], xmm0
6639
6640 SSE_AVX_ST_MXCSR R0_32, A0_32
6641 IEMIMPL_SSE_EPILOGUE
6642 EPILOGUE_3_ARGS
6643ENDPROC iemAImpl_cvtsi2ss_r32_i64
6644
6645
6646;;
6647; vcvtsi2ss instruction - 64-bit variant.
6648;
6649; @return R0_32 The new MXCSR value of the guest.
6650; @param A0_32 The guest's MXCSR register value to use.
6651; @param A1 Pointer to the result operand (output).
6652; @param A2 Pointer to the second operand (input).
6653; @param A3 Pointer to the third operand (input).
6654;
6655BEGINPROC_FASTCALL iemAImpl_vcvtsi2ss_u128_i64, 16
6656 PROLOGUE_3_ARGS
6657 IEMIMPL_AVX_PROLOGUE
6658 SSE_AVX_LD_MXCSR A0_32
6659
6660 movdqu xmm0, [A2]
6661 vcvtsi2ss xmm0, xmm0, qword [A3]
6662 movdqu [A1], xmm0
6663
6664 SSE_AVX_ST_MXCSR R0_32, A0_32
6665 IEMIMPL_AVX_EPILOGUE
6666 EPILOGUE_3_ARGS
6667ENDPROC iemAImpl_vcvtsi2ss_u128_i64
6668
6669
;;
; cvtsi2sd instruction - 32-bit variant.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        cvtsi2sd xmm0, dword [A2]       ; convert 32-bit signed integer -> double precision
        movq    [A1], xmm0              ; store the 64-bit scalar result

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
6690
6691
;;
; vcvtsi2sd instruction - 32-bit variant.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
; @param    A3      Pointer to the third operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vcvtsi2sd_u128_i32, 16
        PROLOGUE_4_ARGS                 ; fix: four arguments (A0..A3) are used; was PROLOGUE_3_ARGS,
                                        ; which does not marshal A3 on ABIs passing the 4th arg on the stack
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movdqu  xmm0, [A2]              ; first source register; upper lane passes through to the result
        vcvtsi2sd xmm0, xmm0, dword [A3] ; convert i32 into the low qword
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS                 ; must pair with PROLOGUE_4_ARGS above
ENDPROC iemAImpl_vcvtsi2sd_u128_i32
6714
6715
;;
; cvtsi2sd instruction - 64-bit variant.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        cvtsi2sd xmm0, qword [A2]       ; convert 64-bit signed integer -> double precision
        movq    [A1], xmm0              ; store the 64-bit scalar result

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
6736
6737
;;
; vcvtsi2sd instruction - 64-bit variant.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
; @param    A3      Pointer to the third operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vcvtsi2sd_u128_i64, 16
        PROLOGUE_4_ARGS                 ; fix: four arguments (A0..A3) are used; was PROLOGUE_3_ARGS,
                                        ; which does not marshal A3 on ABIs passing the 4th arg on the stack
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movdqu  xmm0, [A2]              ; first source register; upper lane passes through to the result
        vcvtsi2sd xmm0, xmm0, qword [A3] ; convert i64 into the low qword
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS                 ; must pair with PROLOGUE_4_ARGS above
ENDPROC iemAImpl_vcvtsi2sd_u128_i64
6760
6761
;
; UCOMISS (SSE)
;
; Unordered scalar single-precision compare; only ZF/PF/CF are produced,
; OF/SF/AF are force-cleared, matching the real instruction's EFLAGS output.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movd    xmm0, A2_32
        movd    xmm1, A3_32
        ucomiss xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, clear OF/SF/AF, leave the rest untouched.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
6785
;;
; VUCOMISS (AVX) - same contract as iemAImpl_ucomiss_u128 above.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movd    xmm0, A2_32
        movd    xmm1, A3_32
        vucomiss xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, clear OF/SF/AF, leave the rest untouched.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS                 ; fix: was EPILOGUE_3_ARGS, mismatching PROLOGUE_4_ARGS above
                                        ; (all sibling comis/ucomis helpers pair 4-arg prologue/epilogue)
ENDPROC iemAImpl_vucomiss_u128
6800
6801
;
; UCOMISD (SSE)
;
; Unordered scalar double-precision compare; only ZF/PF/CF are produced,
; OF/SF/AF are force-cleared.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movq    xmm0, A2
        movq    xmm1, A3
        ucomisd xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, clear OF/SF/AF, leave the rest untouched.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
6825
;;
; VUCOMISD (AVX) - same contract as iemAImpl_ucomisd_u128 above.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movq    xmm0, A2
        movq    xmm1, A3
        vucomisd xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, clear OF/SF/AF, leave the rest untouched.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6840
;
; COMISS (SSE)
;
; Ordered scalar single-precision compare; like UCOMISS but the ordered
; form (they differ only in which NaN operands raise #I per the ISA).
; Only ZF/PF/CF are produced, OF/SF/AF are force-cleared.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movd    xmm0, A2_32
        movd    xmm1, A3_32
        comiss  xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, clear OF/SF/AF, leave the rest untouched.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
6864
;;
; VCOMISS (AVX) - same contract as iemAImpl_comiss_u128 above.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movd    xmm0, A2_32
        movd    xmm1, A3_32
        vcomiss xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, clear OF/SF/AF, leave the rest untouched.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6879
6880
;
; COMISD (SSE)
;
; Ordered scalar double-precision compare; only ZF/PF/CF are produced,
; OF/SF/AF are force-cleared.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movq    xmm0, A2
        movq    xmm1, A3
        comisd  xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, clear OF/SF/AF, leave the rest untouched.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
6904
;;
; VCOMISD (AVX) - same contract as iemAImpl_comisd_u128 above.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movq    xmm0, A2
        movq    xmm1, A3
        vcomisd xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, clear OF/SF/AF, leave the rest untouched.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6919
6920
;;
; Need to move this as well somewhere better?
;
; Two-source argument block: a pair of consecutive 128-bit (XMM sized) values
; passed by a single pointer to the helpers below.
struc IEMMEDIAF2XMMSRC
    .uSrc1 resd 4                       ; first 128-bit source operand
    .uSrc2 resd 4                       ; second 128-bit source operand
endstruc

; Same idea with a pair of 256-bit (YMM sized) values.
struc IEMMEDIAF2YMMSRC
    .uSrc1 resd 8                       ; first 256-bit source operand
    .uSrc2 resd 8                       ; second 256-bit source operand
endstruc
6934
6935
;;
; SSE/AVX instructions with 8-bit immediates of the form
;       xxx     xmm1, xmm2, imm8.
;       vxxx    xmm1, xmm2, xmm3, imm8.
; and we need to load and save the MXCSR register.
;
; The immediate cannot be passed in a register at runtime, so a 256-entry
; jump table is generated (one fixed-size stub per imm8 value) and indexed
; by the immediate.  Each stub must be exactly the stride given to
; IEMIMPL_CALL_JUMP_TABLE_TARGET, so %3 must match the real encoding length.
;
; @param    1       The instruction name.
; @param    2       Flag whether this instruction has a 256-bit AVX variant (1) or not (0).
; @param    3       Number of bytes for the encoding of the SSE variant + ret instruction (AVX is fixed to 6).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, %3 ; dispatch to the stub for this imm8 (stride %3 bytes)
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256                               ; one stub per possible imm8 value
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128


BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6 ; AVX stubs are fixed 6 bytes (5 byte insn + ret)
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A2 + IEMMEDIAF2YMMSRC.uSrc1]
        vmovdqu ymm1, [A2 + IEMMEDIAF2YMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6 ; AVX stubs are fixed 6 bytes
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

; Third parameter = SSE stub size: cmpps encodes in 4 bytes (+ret = 5),
; the 66h/F3h/F2h prefixed forms in 5 bytes (+ret = 6).
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpps, 1, 5
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmppd, 1, 6
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpss, 0, 6
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpsd, 0, 6
7036
7037
;;
; SSE/AVX instructions with 2 full sized operands and an 8-bit immediate of the form
;       xxx     xmm1, xmm2, imm8.
;       vxxx    xmm1, xmm2, imm8
; and we need to load and save the MXCSR register.
;
; Dispatches through a 256-entry jump table indexed by the immediate; each
; stub is padded with int3 to the 8-byte stride used below (the "_6" in the
; macro name refers to the 6-byte instruction encoding, padding makes 8).
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the second media register size operand (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; 8-byte stub stride (insn + ret + int3 padding)
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; pad the stub to the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v%1     xmm0, xmm1, bImm
        ret
        int3                            ; pad the stub to the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v%1     ymm0, ymm1, bImm
        ret
        int3                            ; pad the stub to the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 roundpd
7132
7133
;;
; SSE/AVX instructions with 3 full sized operands and an 8-bit immediate of the form
;       xxx     xmm1, xmm2, imm8.
;       vxxx    xmm1, xmm2, xmm3, imm8
; and we need to load and save the MXCSR register.
;
; Dispatches through a 256-entry jump table indexed by the immediate; each
; stub is padded with int3 to the 8-byte stride used below (the "_6" in the
; macro name refers to the 6-byte instruction encoding, padding makes 8).
;
; @param    1       The instruction name.
; @param    2       Flag whether to emit a 256-bit AVX variant (1) or not (0).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC/IEMMEDIAF2YMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; 8-byte stub stride (insn + ret + int3 padding)
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm       ; SSE form: dst is also the first source
        ret
        int3                            ; pad the stub to the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128


BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 xmm0, xmm1, xmm2, bImm ; AVX form: separate destination
        ret
        int3                            ; pad the stub to the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u128


 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm1, [A2 + IEMMEDIAF2YMMSRC.uSrc1]
        vmovdqu ymm2, [A2 + IEMMEDIAF2YMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 ymm0, ymm1, ymm2, bImm
        ret
        int3                            ; pad the stub to the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 roundss, 0
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 roundsd, 0
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 dpps, 1
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 dppd, 0
7238
7239
;;
; SSE instructions of the form
;       xxx     mm, xmm.
; (packed double -> packed 32-bit integers in an MMX register)
; and we need to load and save the MXCSR register.
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movdqu  xmm0, [A2]
        %1      mm0, xmm0               ; e.g. cvtpd2pi / cvttpd2pi
        movq    [A1], mm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
7270
;;
; SSE instructions of the form
;       xxx     xmm, xmm/m64.
; (packed 32-bit integers from an MMX register -> packed float/double)
; and we need to load and save the MXCSR register.
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movdqu  xmm0, [A1]              ; destination is input/output (upper half may be preserved)
        movq    mm0, A2
        %1      xmm0, mm0               ; e.g. cvtpi2ps / cvtpi2pd
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
7302
;;
; SSE instructions of the form
;       xxx     mm, xmm/m64.
; (packed single precision -> packed 32-bit integers in an MMX register)
; and we need to load and save the MXCSR register.
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run with the guest's MXCSR

        movq    xmm0, A2
        %1      mm0, xmm0               ; e.g. cvtps2pi / cvttps2pi
        movq    [A1], mm0

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR in R0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
7333
;
; All forms of RDRAND and RDSEED
;
; Only CF is defined after these instructions (CF=1 means a valid random
; value was returned); the other arithmetic flags are cleared.
;
; @param    1       The instruction mnemonic (rdrand/rdseed).
; @param    2       The register to receive the random value (ax/eax/rax).
; @param    3       The operand width in bits (16/32/64), used in the symbol name.
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2                      ; rdrand/rdseed into ax/eax/rax
        mov     [A0], %2
        ; Keep CF, clear OF/SF/ZF/AF/PF, leave the rest untouched.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
7358
7359
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The 2-bit round-function selector is an immediate, so dispatch goes
; through a 256-entry jump table indexed by the imm8 value.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6 ; 6-byte stub stride (5 byte insn + ret)
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_sha1rnds4_u128
7391
7392
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; XMM0 is an implicit third operand of the instruction, so the caller's
; constants are loaded into the real xmm0 before executing it.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; implicit XMM0 operand
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]
        sha256rnds2 xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
7415
7416
;
; 32-bit forms of ADCX and ADOX
;
; These consume and produce exactly one flag each (CF for adcx, OF for
; adox, passed as macro parameter %2); all other flags are untouched.
;
; @param    1       The instruction mnemonic (adcx/adox).
; @param    2       The single EFLAGS bit consumed/produced (X86_EFL_CF or X86_EFL_OF).
;
; @returns  Updated EFLAGS.
; @param    A0      Incoming EFLAGS value (input).
; @param    A1      Pointer to the destination operand (input/output).
; @param    A2      32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A0_32, %2, 0     ; materialize the guest's CF/OF before the add
        %1      A2_32, [A1]             ; A2_32 = A2_32 + [A1] + flag
        mov     [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0 ; merge only %2 back, return EFLAGS

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro
7437
;
; 64-bit forms of ADCX and ADOX
;
; Same single-flag contract as the 32-bit variant above (CF for adcx,
; OF for adox, passed as macro parameter %2).
;
; @param    1       The instruction mnemonic (adcx/adox).
; @param    2       The single EFLAGS bit consumed/produced (X86_EFL_CF or X86_EFL_OF).
;
; @returns  Updated EFLAGS.
; @param    A0      Incoming EFLAGS value (input).
; @param    A1      Pointer to the destination operand (input/output).
; @param    A2      64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A0_32, %2, 0     ; materialize the guest's CF/OF before the add
        %1      A2, [A1]                ; A2 = A2 + [A1] + flag
        mov     [A1], A2
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0 ; merge only %2 back, return EFLAGS

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
7464
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette