VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 107663

Last change on this file since 107663 was 107305, checked in by vboxsync, 2 months ago

IEM: Fixed IEMIMPL_FP_2 macro for a case with just one AVX input argument (the code was incorrectly expecting two).

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 221.4 KB
Line 
1; $Id: IEMAllAImpl.asm 107305 2024-12-12 18:50:15Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2024 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; This is handy for generating absolutely correct EFLAGS.
43;%define IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
44
45
46;;
47; RET XX / RET wrapper for fastcall.
48;
; @param 1  The number of argument bytes to pop off the stack on return.
;           Only used for Windows/x86 fastcall, where the callee cleans up
;           the stack arguments; all other targets use plain caller cleanup.
49%macro RET_FASTCALL 1
50%ifdef RT_ARCH_X86
51 %ifdef RT_OS_WINDOWS
52 ret %1 ; Windows x86 fastcall: callee pops %1 bytes of arguments.
53 %else
54 ret ; Other x86 targets: caller cleans up.
55 %endif
56%else
57 ret ; AMD64: register-based conventions, nothing to pop.
58%endif
59%endmacro
60
61;;
62; NAME for fastcall functions.
63;
; On Windows/x86 the fastcall convention decorates the symbol as
; <prefix><name>@<cbArgs>; everywhere else the plain NAME() mangling is used
; and the argument byte count / prefix are ignored.
;
64;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
65; escaping (or whatever the dollar is good for here). Thus the ugly
66; prefix argument.
67;
68%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
69%ifdef RT_ARCH_X86
70 %ifdef RT_OS_WINDOWS
71 %undef NAME_FASTCALL
72 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
73 %endif
74%endif
75
76;;
77; BEGINPROC for fastcall functions.
78;
; Emits the (possibly fastcall-decorated) hidden global function label
; followed by an ENDBR instruction when indirect-branch tracking is enabled.
;
79; @param 1 The function name (C).
80; @param 2 The argument size on x86.
81;
82%macro BEGINPROC_FASTCALL 2
83GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
84 IBT_ENDBRxx
85%endmacro
86
87
88;
89; We employ some macro assembly here to hide the calling convention differences.
90;
91%ifdef RT_ARCH_AMD64
 ; AMD64: all four arguments arrive in registers, so the PROLOGUE macros are
 ; empty and the EPILOGUE macros just return.  A0..A3 name the argument
 ; registers of the active convention (GCC/SysV vs MSC) and T0..T2 are
 ; volatile scratch registers; the _32/_16/_8 variants are the sub-register
 ; views of the same register.
92 %macro PROLOGUE_1_ARGS 0
93 %endmacro
94 %macro EPILOGUE_1_ARGS 0
95 ret
96 %endmacro
97 %macro EPILOGUE_1_ARGS_EX 0
98 ret
99 %endmacro
100
101 %macro PROLOGUE_2_ARGS 0
102 %endmacro
103 %macro EPILOGUE_2_ARGS 0
104 ret
105 %endmacro
106 %macro EPILOGUE_2_ARGS_EX 1
107 ret
108 %endmacro
109
110 %macro PROLOGUE_3_ARGS 0
111 %endmacro
112 %macro EPILOGUE_3_ARGS 0
113 ret
114 %endmacro
115 %macro EPILOGUE_3_ARGS_EX 1
116 ret
117 %endmacro
118
119 %macro PROLOGUE_4_ARGS 0
120 %endmacro
121 %macro EPILOGUE_4_ARGS 0
122 ret
123 %endmacro
124 %macro EPILOGUE_4_ARGS_EX 1
125 ret
126 %endmacro
127
128 %ifdef ASM_CALL64_GCC
 ; System V AMD64 argument registers: rdi, rsi, rdx, rcx.
129 %define A0 rdi
130 %define A0_32 edi
131 %define A0_16 di
132 %define A0_8 dil
133
134 %define A1 rsi
135 %define A1_32 esi
136 %define A1_16 si
137 %define A1_8 sil
138
139 %define A2 rdx
140 %define A2_32 edx
141 %define A2_16 dx
142 %define A2_8 dl
143
144 %define A3 rcx
145 %define A3_32 ecx
146 %define A3_16 cx
147 %define A3_8 cl
148 %endif
149
150 %ifdef ASM_CALL64_MSC
 ; Microsoft x64 argument registers: rcx, rdx, r8, r9.
151 %define A0 rcx
152 %define A0_32 ecx
153 %define A0_16 cx
154 %define A0_8 cl
155
156 %define A1 rdx
157 %define A1_32 edx
158 %define A1_16 dx
159 %define A1_8 dl
160
161 %define A2 r8
162 %define A2_32 r8d
163 %define A2_16 r8w
164 %define A2_8 r8b
165
166 %define A3 r9
167 %define A3_32 r9d
168 %define A3_16 r9w
169 %define A3_8 r9b
170 %endif
171
 ; Temporary/scratch registers, volatile in both 64-bit conventions.
172 %define T0 rax
173 %define T0_32 eax
174 %define T0_16 ax
175 %define T0_8 al
176
177 %define T1 r11
178 %define T1_32 r11d
179 %define T1_16 r11w
180 %define T1_8 r11b
181
182 %define T2 r10 ; only AMD64
183 %define T2_32 r10d
184 %define T2_16 r10w
185 %define T2_8 r10b
186
187 ;
188 ; Return value, same as T0 but to make it more obvious
189 ; that this is a return value.
190 ;
191 %define R0 rax
192 %define R0_32 eax
193 %define R0_16 ax
194 %define R0_8 al
195
196%else
197 ; x86
 ; x86 fastcall: the first two arguments are in ecx/edx (A0/A1); the third
 ; and fourth are on the stack and get loaded into the callee-saved ebx/esi
 ; (A2/A3) by the PROLOGUE macros, hence the push/pop pairs.
198 %macro PROLOGUE_1_ARGS 0
199 push edi
200 %endmacro
201 %macro EPILOGUE_1_ARGS 0
202 pop edi
203 ret 0
204 %endmacro
205 %macro EPILOGUE_1_ARGS_EX 1
206 pop edi
207 ret %1
208 %endmacro
209
210 %macro PROLOGUE_2_ARGS 0
211 push edi
212 %endmacro
213 %macro EPILOGUE_2_ARGS 0
214 pop edi
215 ret 0
216 %endmacro
217 %macro EPILOGUE_2_ARGS_EX 1
218 pop edi
219 ret %1
220 %endmacro
221
222 %macro PROLOGUE_3_ARGS 0
223 push ebx
224 mov ebx, [esp + 4 + 4] ; 3rd argument: skip return address + saved ebx.
225 push edi
226 %endmacro
227 %macro EPILOGUE_3_ARGS_EX 1
228 %if (%1) < 4
229 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
230 %endif
231 pop edi
232 pop ebx
233 ret %1
234 %endmacro
235 %macro EPILOGUE_3_ARGS 0
236 EPILOGUE_3_ARGS_EX 4
237 %endmacro
238
239 %macro PROLOGUE_4_ARGS 0
240 push ebx
241 push edi
242 push esi
243 mov ebx, [esp + 12 + 4 + 0] ; 3rd argument: skip 3 saved regs + return address.
244 mov esi, [esp + 12 + 4 + 4] ; 4th argument.
245 %endmacro
246 %macro EPILOGUE_4_ARGS_EX 1
247 %if (%1) < 8
248 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
249 %endif
250 pop esi
251 pop edi
252 pop ebx
253 ret %1
254 %endmacro
255 %macro EPILOGUE_4_ARGS 0
256 EPILOGUE_4_ARGS_EX 8
257 %endmacro
258
259 %define A0 ecx
260 %define A0_32 ecx
261 %define A0_16 cx
262 %define A0_8 cl
263
264 %define A1 edx
265 %define A1_32 edx
266 %define A1_16 dx
267 %define A1_8 dl
268
269 %define A2 ebx
270 %define A2_32 ebx
271 %define A2_16 bx
272 %define A2_8 bl
273
274 %define A3 esi
275 %define A3_32 esi
276 %define A3_16 si
277
278 %define T0 eax
279 %define T0_32 eax
280 %define T0_16 ax
281 %define T0_8 al
282
283 %define T1 edi
284 %define T1_32 edi
285 %define T1_16 di
286%endif
287
288
289;;
290; Load the relevant flags from [%1] if there are undefined flags (%3).
291;
; Only emits code when some flags are undefined (%3) or must be loaded (%4);
; otherwise the instruction's own flag output makes loading unnecessary.
;
292; @remarks Clobbers T0, stack. Changes EFLAGS.
293; @param 1 The parameter (A0..A3) holding the eflags value.
294; @param 2 The set of modified flags.
295; @param 3 The set of undefined flags.
296; @param 4 The flags that must be loaded.
297;
298%macro IEM_MAYBE_LOAD_FLAGS 4
299 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
300 pushf ; store current flags
301 mov T0_32, %1 ; load the guest flags
302 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified and undefined flags
303 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified and undefined flags.
304 or [xSP], T0 ; merge guest flags with host flags.
305 popf ; load the mixed flags.
306
307 %elif (%3 + %4) != 0
308 %if 1 ; This approach seems faster on intel 10980XE
309 %if (%3 | %4) == X86_EFL_CF
310 ; Use bt to load bit into CF
311 bt %1, X86_EFL_CF_BIT
312 %else
313 ; Use ADD to set OF and SAHF for the rest. ASSUMES T0_32 is eax!
314 mov eax, %1
315 %if (%3 | %4) == X86_EFL_OF
316 ; Use ADD to set OF.
317 shl eax, 31 - X86_EFL_OF_BIT
318 add eax, 80000000h
319 %elif ((%3 | %4) & X86_EFL_OF) != 0
320 ; Use ADD to set OF.
321 xchg al, ah
322 shl al, 15 - X86_EFL_OF_BIT
323 add al, 80h
324 ; Use SAHF to set the other status flags.
325 sahf
326 %else ; OF not needed; so al -> ah and load ah into eflags.
327 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
328 shl eax, 8
329 %else
330 xchg al, ah
331 %endif
332 sahf
333 %endif
334 %endif
335
336 %else
337 pushf ; store current flags
338 mov T0_32, %1 ; load the guest flags
339 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
340 and T0_32, (%2 | %3) ; select the modified and undefined flags.
341 or [xSP], T0 ; merge guest flags with host flags.
342 popf ; load the mixed flags.
343 %endif
344 %endif
345%endmacro
346
347;;
348; Load the relevant flags from [%1].
349;
; Unlike IEM_MAYBE_LOAD_FLAGS this unconditionally loads the requested
; flags (%2) into EFLAGS.
;
350; @remarks Clobbers T0, stack. Changes EFLAGS.
351; @param 1 The parameter (A0..A3) holding the eflags value.
352; @param 2 The set of flags to load.
353; @param 3 The set of undefined flags.
354;
355%macro IEM_LOAD_FLAGS 3
356 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
357 pushf ; store current flags
358 mov T0_32, %1 ; load the guest flags
359 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified, undefined and status flags
360 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
361 or [xSP], T0 ; merge guest flags with host flags.
362 popf ; load the mixed flags.
363
364 %elif 1 ; This approach seems faster on intel 10980XE
365 %if (%3 | %2) == X86_EFL_CF
366 ; Use bt to load bit into CF
367 bt %1, X86_EFL_CF_BIT
368 %else
369 mov eax, %1 ; ASSUMES T0_32 is eax!!
370 %if (%3 | %2) == X86_EFL_OF
371 ; Use ADD to set OF.
372 shl eax, 31 - X86_EFL_OF_BIT
373 add eax, 80000000h
374 %elif ((%3 | %2) & X86_EFL_OF) != 0
375 ; Use ADD to set OF.
376 xchg al, ah
377 shl al, 15 - X86_EFL_OF_BIT
378 add al, 80h
379 ; Use SAHF to set the other status flags.
380 sahf
381 %else ; OF not needed; so al -> ah and load ah into eflags.
382 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
383 shl eax, 8
384 %else
385 xchg al, ah
386 %endif
387 sahf
388 %endif
389 %endif ; (%3 | %2) != X86_EFL_CF
390
391 %else
392 pushf ; store current flags
393 mov T0_32, %1 ; load the guest flags
394 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
395 and T0_32, (%2 | %3) ; select the modified and undefined flags.
396 or [xSP], T0 ; merge guest flags with host flags.
397 popf ; load the mixed flags.
398 %endif
399%endmacro
400
401;;
402; Merge incoming guest EFLAGS (%1) with host EFLAGS into EAX (T0).
403;
; @note The combined result is left in EAX (T0) as the function return value;
;       nothing is written back to %1.
;
404; @remarks Clobbers T0, T1, %1, stack.
405; @param 1 The parameter (A0..A3) holding the OLD eflags value. Clobbered.
406; @param 2 The mask of modified flags to save.
407; @param 3 The mask of undefined flags to (maybe) save.
408; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
409;
410%macro IEM_SAVE_FLAGS_RETVAL 4 0
411 %if (%2 | %3 | %4) != 0
412 mov T1_32, %1 ; flags
413 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
414 pushf
415 pop T0
416 and %1, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
417 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
418 %else
419 %if (%2 | %3 | %4) == X86_EFL_CF
420 setc T0_8
421 %elif (%2 | %3) == X86_EFL_OF
422 seto T0_8
423 shl T0_32, X86_EFL_OF_BIT
424 %elif (%2 | %3) == X86_EFL_ZF
425 setz T0_8 ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
426 shl T0_32, X86_EFL_ZF_BIT
427 %elif (%2 | %3) <= 0xff
428 lahf
429 movzx eax, ah ; ASSUMES T0_32 is eax!
430 %elif 1 ; The locked functions are generally faster on 10980XE with this approach
431 lahf ; while there seems only to be a tiny advantage in most other tests.
432 movzx eax, ah ; ASSUMES T0_32 is eax!
433 jno .of_is_clear
434 or eax, X86_EFL_OF
435.of_is_clear:
436 %else
437 pushf ; this is a bit slow
438 pop T0
439 %endif
440 and %1, ~(%2 | %3 | %4) ; clear the modified & undefined & zeroed flags.
441 and T0_32, (%2 | %3) ; select the modified and undefined flags.
442 %endif
443 or T0_32, %1 ; combine the flags. ASSUMES T0 = eax!
444 ;mov %1, T0_32 ; save the flags.
445 %else
446 mov T0_32, %1 ; Nothing to merge; just return the input flags in EAX.
447 %endif
448%endmacro
449
450;;
451; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
452;
; The result is stored back into %1.  If all three masks are zero this
; expands to nothing and %1 is left untouched.
;
453; @remarks Clobbers T0, T1, stack.
454; @param 1 The parameter (A0..A3) holding the eflags value.
455; @param 2 The mask of modified flags to save.
456; @param 3 Mask of additional flags to always clear
457; @param 4 Mask of additional flags to always set.
458;
459;; @todo make it stuff the result into EAX?
460%macro IEM_SAVE_AND_ADJUST_FLAGS 4
461 %if (%2 | %3 | %4) != 0
462 pushf
463 pop T1 ; T1 = current host EFLAGS.
464 mov T0_32, %1 ; load flags.
465 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
466 and T1_32, (%2) ; select the modified flags.
467 or T0_32, T1_32 ; combine the flags.
468 %if (%4) != 0
469 or T0_32, %4 ; add the always set flags.
470 %endif
471 mov %1, T0_32 ; save the result.
472 %endif
473%endmacro
474
475;;
476; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
477; signed input (%4[%5]) and parity index (%6), storing the result into EAX (T0).
478;
479; @note %4 & %6 must not be RAX, EAX, or AX! So, don't use with full MUL/IMUL.
480;
481; @remarks Clobbers T0, T1 (AMD64 only), stack, %6, EFLAGS, %1.
482; @param 1 The parameter (A0..A3) holding the eflags value.
483; @param 2 The mask of modified flags to save.
484; @param 3 Mask of additional flags to always clear
485; @param 4 The result register to set SF by.
486; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
487; @param 6 The (full) register containing the parity table index. Will be modified!
488%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL 6
489 pushf
490 pop T0 ; T0 = host EFLAGS produced by the instruction.
491 and %1, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
492 and T0_32, (%2) ; select the modified flags.
493 or T0_32, %1 ; combine the flags.
494
495 ; First calculate SF as it is the same register as %6 (only %6 is always full width).
496 bt %4, %5 - 1 ; CF = sign bit of the result.
497 jnc %%sf_clear
498 or T0_32, X86_EFL_SF
499 %%sf_clear:
500
501 ; Parity last.
502 and %6, 0xff ; PF is calculated from the low byte only.
503 %ifdef RT_ARCH_AMD64
504 lea T1, [NAME(g_afParity) xWrtRIP]
505 or T0_8, [T1 + %6]
506 %else
507 or T0_8, [NAME(g_afParity) + %6]
508 %endif
509
510 ;mov %1, T0_32 ; save the result.
511 ; ASSUMES T0 = eax!
512%endmacro
513
514;;
515; Calculates the new EFLAGS using fixed clear and set bit masks.
516;
; The adjusted flags are returned in EAX (T0); %1 is not written back.
;
517; @remarks Clobbers/returns T0.
518; @param 1 The parameter (A0..A3) holding the eflags value.
519; @param 2 Mask of additional flags to always clear
520; @param 3 Mask of additional flags to always set.
521;
522%macro IEM_ADJUST_FLAGS_RETVAL 3
523 mov T0_32, %1 ; Load flags. ASSUMES T0 is EAX!
524 %if (%2 | %3) != 0
525 %if (%2) != 0
526 and T0_32, ~(%2) ; Remove the always cleared flags.
527 %endif
528 %if (%3) != 0
529 or T0_32, %3 ; Add the always set flags.
530 %endif
531 %endif
532%endmacro
533
534;;
535; Calculates the new EFLAGS using fixed clear and set bit masks.
536;
; PF is recalculated from the low byte of %4 via the g_afParity lookup table
; and the result is stored back into %1.
;
537; @remarks Clobbers T0, T2 (AMD64 only), %4, EFLAGS.
538; @param 1 The parameter (A0..A3) holding the eflags value.
539; @param 2 Mask of additional flags to always clear
540; @param 3 Mask of additional flags to always set.
541; @param 4 The (full) register containing the parity table index. Will be modified!
542;
543%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
544 mov T0_32, %1 ; Load flags.
545 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
546 %if (%3) != 0
547 or T0_32, %3 ; Add the always set flags.
548 %endif
549 and %4, 0xff ; PF is calculated from the low byte only.
550 %ifdef RT_ARCH_AMD64
551 lea T2, [NAME(g_afParity) xWrtRIP]
552 or T0_8, [T2 + %4]
553 %else
554 or T0_8, [NAME(g_afParity) + %4]
555 %endif
556 mov %1, T0_32 ; Save the result.
557%endmacro
558
559
560;;;; OLD EFLAGS macros.
561;;;; OLD EFLAGS macros.
562;;;; OLD EFLAGS macros.
563;;;; OLD EFLAGS macros.
564;;;; OLD EFLAGS macros.
; The _OLD variants below take a POINTER to the eflags in %1 (dereferenced as
; [%1]) instead of the eflags value itself, and store results back through it.
565
566;;
567; Load the relevant flags from [%1] if there are undefined flags (%3).
568;
569; @remarks Clobbers T0, stack. Changes EFLAGS.
570; @param 1 The parameter (A0..A3) pointing to the eflags.
571; @param 2 The set of modified flags.
572; @param 3 The set of undefined flags.
573; @param 4 The flags that must be loaded.
574;
575%macro IEM_MAYBE_LOAD_FLAGS_OLD 4
576 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
577 pushf ; store current flags
578 mov T0_32, [%1] ; load the guest flags
579 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified and undefined flags
580 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified and undefined flags.
581 or [xSP], T0 ; merge guest flags with host flags.
582 popf ; load the mixed flags.
583
584 %elif (%3 + %4) != 0
585 %if 1 ; This approach seems faster on intel 10980XE
586 %if (%3 | %4) == X86_EFL_CF
587 ; Use bt to load bit into CF
588 bt dword [%1], X86_EFL_CF_BIT
589 %else
590 ; Use ADD to set OF and SAHF for the rest. ASSUMES T0_32 is eax!
591 mov eax, [%1]
592 %if (%3 | %4) == X86_EFL_OF
593 ; Use ADD to set OF.
594 shl eax, 31 - X86_EFL_OF_BIT
595 add eax, 80000000h
596 %elif ((%3 | %4) & X86_EFL_OF) != 0
597 ; Use ADD to set OF.
598 xchg al, ah
599 shl al, 15 - X86_EFL_OF_BIT
600 add al, 80h
601 ; Use SAHF to set the other status flags.
602 sahf
603 %else ; OF not needed; so al -> ah and load ah into eflags.
604 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
605 shl eax, 8
606 %else
607 xchg al, ah
608 %endif
609 sahf
610 %endif
611 %endif
612
613 %else
614 pushf ; store current flags
615 mov T0_32, [%1] ; load the guest flags
616 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
617 and T0_32, (%2 | %3) ; select the modified and undefined flags.
618 or [xSP], T0 ; merge guest flags with host flags.
619 popf ; load the mixed flags.
620 %endif
621 %endif
622%endmacro
623
624;;
625; Load the relevant flags from [%1].
626;
; Unconditional variant of IEM_MAYBE_LOAD_FLAGS_OLD: always loads the
; requested flags (%2) into EFLAGS.
;
627; @remarks Clobbers T0, stack. Changes EFLAGS.
628; @param 1 The parameter (A0..A3) pointing to the eflags.
629; @param 2 The set of flags to load.
630; @param 3 The set of undefined flags.
631;
632%macro IEM_LOAD_FLAGS_OLD 3
633 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
634 pushf ; store current flags
635 mov T0_32, [%1] ; load the guest flags
636 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified, undefined and status flags
637 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
638 or [xSP], T0 ; merge guest flags with host flags.
639 popf ; load the mixed flags.
640
641 %elif 1 ; This approach seems faster on intel 10980XE
642 %if (%3 | %2) == X86_EFL_CF
643 ; Use bt to load bit into CF
644 bt dword [%1], X86_EFL_CF_BIT
645 %else
646 mov eax, [%1] ; ASSUMES T0_32 is eax!!
647 %if (%3 | %2) == X86_EFL_OF
648 ; Use ADD to set OF.
649 shl eax, 31 - X86_EFL_OF_BIT
650 add eax, 80000000h
651 %elif ((%3 | %2) & X86_EFL_OF) != 0
652 ; Use ADD to set OF.
653 xchg al, ah
654 shl al, 15 - X86_EFL_OF_BIT
655 add al, 80h
656 ; Use SAHF to set the other status flags.
657 sahf
658 %else ; OF not needed; so al -> ah and load ah into eflags.
659 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
660 shl eax, 8
661 %else
662 xchg al, ah
663 %endif
664 sahf
665 %endif
666 %endif ; (%3 | %2) != X86_EFL_CF
667
668 %else
669 pushf ; store current flags
670 mov T0_32, [%1] ; load the guest flags
671 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
672 and T0_32, (%2 | %3) ; select the modified and undefined flags.
673 or [xSP], T0 ; merge guest flags with host flags.
674 popf ; load the mixed flags.
675 %endif
676%endmacro
677
678;;
679; Update the flag.
;
; Merges the flags the instruction modified (taken from the host EFLAGS)
; into the guest EFLAGS at [%1] and stores the result back through %1.
680;
681; @remarks Clobbers T0, T1, stack.
682; @param 1 The register pointing to the EFLAGS.
683; @param 2 The mask of modified flags to save.
684; @param 3 The mask of undefined flags to (maybe) save.
685; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
686;
687%macro IEM_SAVE_FLAGS_OLD 4 0
688 %if (%2 | %3 | %4) != 0
689 mov T1_32, [%1] ; flags
690 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
691 pushf
692 pop T0
693 and T1_32, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
694 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
695 %else
696 %if (%2 | %3 | %4) == X86_EFL_CF
697 setc T0_8
698 %elif (%2 | %3) == X86_EFL_OF
699 seto T0_8
700 shl T0_32, X86_EFL_OF_BIT
701 %elif (%2 | %3) == X86_EFL_ZF
702 setz T0_8 ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
703 shl T0_32, X86_EFL_ZF_BIT
704 %elif (%2 | %3) <= 0xff
705 lahf
706 movzx eax, ah ; ASSUMES T0_32 is eax!
707 %elif 1 ; The locked functions are generally faster on 10980XE with this approach
708 lahf ; while there seems only to be a tiny advantage in most other tests.
709 movzx eax, ah ; ASSUMES T0_32 is eax!
710 jno .of_is_clear
711 or eax, X86_EFL_OF
712.of_is_clear:
713 %else
714 pushf ; this is a bit slow
715 pop T0
716 %endif
717 and T1_32, ~(%2 | %3 | %4) ; clear the modified & undefined & zeroed flags.
718 and T0_32, (%2 | %3) ; select the modified and undefined flags.
719 %endif
720 or T0_32, T1_32 ; combine the flags.
721 mov [%1], T0_32 ; save the flags.
722 %endif
723%endmacro
724
725;;
726; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
727;
; Pointer variant of IEM_SAVE_AND_ADJUST_FLAGS; the result is stored to [%1].
; Expands to nothing when all three masks are zero.
;
728; @remarks Clobbers T0, T1, stack.
729; @param 1 The register pointing to the EFLAGS.
730; @param 2 The mask of modified flags to save.
731; @param 3 Mask of additional flags to always clear
732; @param 4 Mask of additional flags to always set.
733;
734%macro IEM_SAVE_AND_ADJUST_FLAGS_OLD 4
735 %if (%2 | %3 | %4) != 0
736 pushf
737 pop T1 ; T1 = current host EFLAGS.
738 mov T0_32, [%1] ; load flags.
739 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
740 and T1_32, (%2) ; select the modified flags.
741 or T0_32, T1_32 ; combine the flags.
742 %if (%4) != 0
743 or T0_32, %4 ; add the always set flags.
744 %endif
745 mov [%1], T0_32 ; save the result.
746 %endif
747%endmacro
748
749;;
750; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
751; signed input (%4[%5]) and parity index (%6).
752;
753; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
754; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
755; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
756;
757; @remarks Clobbers T0, T1, T2 (AMD64 only), stack, %6, EFLAGS.
758; @param 1 The register pointing to the EFLAGS.
759; @param 2 The mask of modified flags to save.
760; @param 3 Mask of additional flags to always clear
761; @param 4 The result register to set SF by.
762; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
763; @param 6 The (full) register containing the parity table index. Will be modified!
764
765%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD 6
766 %ifdef RT_ARCH_AMD64
767 pushf
768 pop T2 ; AMD64 has a spare scratch reg; no need to save T0.
769 %else
770 push T0 ; x86: preserve T0/xAX (holds the MUL/IMUL result).
771 pushf
772 pop T0
773 %endif
774 mov T1_32, [%1] ; load flags.
775 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
776 %ifdef RT_ARCH_AMD64
777 and T2_32, (%2) ; select the modified flags.
778 or T1_32, T2_32 ; combine the flags.
779 %else
780 and T0_32, (%2) ; select the modified flags.
781 or T1_32, T0_32 ; combine the flags.
782 pop T0 ; restore T0/xAX.
783 %endif
784
785 ; First calculate SF as it's likely to be referring to the same register as %6 does.
786 bt %4, %5 - 1 ; CF = sign bit of the result.
787 jnc %%sf_clear
788 or T1_32, X86_EFL_SF
789 %%sf_clear:
790
791 ; Parity last.
792 and %6, 0xff ; PF is calculated from the low byte only.
793 %ifdef RT_ARCH_AMD64
794 lea T2, [NAME(g_afParity) xWrtRIP]
795 or T1_8, [T2 + %6]
796 %else
797 or T1_8, [NAME(g_afParity) + %6]
798 %endif
799
800 mov [%1], T1_32 ; save the result.
801%endmacro
802
803;;
804; Calculates the new EFLAGS using fixed clear and set bit masks.
805;
; Pointer variant of IEM_ADJUST_FLAGS_RETVAL: updates [%1] in place.
; Expands to nothing when both masks are zero.
;
806; @remarks Clobbers T0.
807; @param 1 The register pointing to the EFLAGS.
808; @param 2 Mask of additional flags to always clear
809; @param 3 Mask of additional flags to always set.
810;
811%macro IEM_ADJUST_FLAGS_OLD 3
812 %if (%2 | %3) != 0
813 mov T0_32, [%1] ; Load flags.
814 %if (%2) != 0
815 and T0_32, ~(%2) ; Remove the always cleared flags.
816 %endif
817 %if (%3) != 0
818 or T0_32, %3 ; Add the always set flags.
819 %endif
820 mov [%1], T0_32 ; Save the result.
821 %endif
822%endmacro
823
824;;
825; Calculates the new EFLAGS using fixed clear and set bit masks.
826;
; Pointer variant of IEM_ADJUST_FLAGS_WITH_PARITY: PF is recalculated from
; the low byte of %4 via the g_afParity table and the result stored to [%1].
;
827; @remarks Clobbers T0, T2 (AMD64 only), %4, EFLAGS.
828; @param 1 The register pointing to the EFLAGS.
829; @param 2 Mask of additional flags to always clear
830; @param 3 Mask of additional flags to always set.
831; @param 4 The (full) register containing the parity table index. Will be modified!
832;
833%macro IEM_ADJUST_FLAGS_WITH_PARITY_OLD 4
834 mov T0_32, [%1] ; Load flags.
835 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
836 %if (%3) != 0
837 or T0_32, %3 ; Add the always set flags.
838 %endif
839 and %4, 0xff ; PF is calculated from the low byte only.
840 %ifdef RT_ARCH_AMD64
841 lea T2, [NAME(g_afParity) xWrtRIP]
842 or T0_8, [T2 + %4]
843 %else
844 or T0_8, [NAME(g_afParity) + %4]
845 %endif
846 mov [%1], T0_32 ; Save the result.
847%endmacro
848
849
850
851;;
852; Loads register with offset of imm8 instruction -- used by all of the instruction
853; implementations which lay out jump tables of 256x immediate byte variants.
854; Also checks that the instruction size matches the offsets in the table.
855;
856; @param 1 The register to receive the jump target address (T1).
857; @param 2 The register containing the imm8 index (A1 / A2 / A3).
858; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
859; @note Implicitly uses local symbols .imm0, .imm1, and .immEnd
860; (implementation artifacts of each instruction jump table).
861;
862; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]`.
; The multiply is done with one or two LEAs since the scale is not always a
; power of two; only entry sizes 5..12 are supported.
863;
864%macro IEMIMPL_JUMP_TABLE_TARGET_INT 3
865 lea %1, [.imm0 xWrtRIP]
866 %if %3 == 5
867 lea T0, [%2 + %2*4] ; *5
868 lea %1, [%1 + T0] ; *5 + .imm0
869 %elif %3 == 6
870 lea T0, [%2 + %2*2] ; *3
871 lea %1, [%1 + T0*2] ; *6 + .imm0
872 %elif %3 == 7
873 lea T0, [%2 + %2*2] ; *3
874 lea T0, [T0 + %2*4] ; *7
875 lea %1, [%1 + T0] ; *7 + .imm0
876 %elif %3 == 8
877 lea %1, [%1 + %2*8] ; *8 + .imm0
878 %elif %3 == 9
879 lea T0, [%2 + %2*8] ; *9
880 lea %1, [%1 + T0] ; *9 + .imm0
881 %elif %3 == 10
882 lea T0, [%2 + %2*4] ; *5
883 lea %1, [%1 + T0*2] ; *10 + .imm0
884 %elif %3 == 11
885 lea T0, [%2 + %2*4] ; *5
886 lea T0, [%2 + T0*2] ; *11
887 lea %1, [%1 + T0] ; *11 + .imm0
888 %elif %3 == 12
889 lea T0, [%2 + %2*2] ; *3
890 lea %1, [%1 + T0*4] ; *12 + .imm0
891 %else
892 %error Unexpected instruction byte count in IEMIMPL_JUMP_TABLE_TARGET_INT
893 %endif
894 ; check size: 'warning: value does not fit in 8 bit field' if bad
895 times (.imm1 - .imm0 + %3) %% %3 db 999 * \
896 (.imm1 - .imm0 + %3)
897 ; check alignment: 'warning: value does not fit in 8 bit field' if bad
898 times ((.immEnd - .imm0) - 256 * %3) db 999 * \
899 ((.immEnd - .imm0) - 256 * %3)
900%endmacro
901
;;
; Wrapper around IEMIMPL_JUMP_TABLE_TARGET_INT that adjusts the per-entry
; size for IBT builds, where each table entry carries 4 extra bytes
; (presumably the endbr instruction -- see IBT_ENDBRxx; TODO confirm).
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table.
;
902%macro IEMIMPL_JUMP_TABLE_TARGET 3
903 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
904 IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, (%3 + 4)
905 %else
906 IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, %3
907 %endif
908%endmacro
909
910
911;;
912; Calls the given imm8 instruction -- used by all of the instruction
913; implementations which lay out jump tables of 256x immediate byte variants.
914;
915; @param 1 The register to receive the jump target address (T1).
916; @param 2 The register containing the imm8 index (A1 / A2 / A3).
917; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
918;
919; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]` +
920; `IBT_NOTRACK, call %1`.
921;
922%macro IEMIMPL_CALL_JUMP_TABLE_TARGET 3
923 IEMIMPL_JUMP_TABLE_TARGET %1, %2, %3 ; %1 = address of the table entry.
924 IBT_NOTRACK ; The entry is called indirectly w/o tracking.
925 call %1
926%endmacro
927
928
929;*********************************************************************************************************************************
930;* External Symbols *
931;*********************************************************************************************************************************
932extern NAME(g_afParity)
933
934
935;;
936; Macro for implementing a binary operator.
937;
938; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
939; variants, except on 32-bit system where the 64-bit accesses requires hand
940; coding.
941;
942; All the functions take the incoming eflags value in A0, a pointer to the
943; destination operand in A1 and the source register operand in A2; the
; updated eflags value is returned in EAX (see IEM_SAVE_FLAGS_RETVAL).
944;
945; @param 1 The instruction mnemonic.
946; @param 2 Non-zero if there should be a locked version.
947; @param 3 The modified flags.
948; @param 4 The undefined flags.
949; @param 5 The flags that must be loaded (ADC, SBB).
950; @param 6 The flags that will be zeroed by the operation.
951;
952%macro IEMIMPL_BIN_OP 6
953BEGINCODE
954BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
955 PROLOGUE_3_ARGS
956 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
957 %1 byte [A1], A2_8
958 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
959 EPILOGUE_3_ARGS
960ENDPROC iemAImpl_ %+ %1 %+ _u8
961
962BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
963 PROLOGUE_3_ARGS
964 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
965 %1 word [A1], A2_16
966 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
967 EPILOGUE_3_ARGS
968ENDPROC iemAImpl_ %+ %1 %+ _u16
969
970BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
971 PROLOGUE_3_ARGS
972 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
973 %1 dword [A1], A2_32
974 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
975 EPILOGUE_3_ARGS
976ENDPROC iemAImpl_ %+ %1 %+ _u32
977
978 %ifdef RT_ARCH_AMD64
979BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
980 PROLOGUE_3_ARGS
981 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
982 %1 qword [A1], A2
983 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
984 EPILOGUE_3_ARGS_EX 8
985ENDPROC iemAImpl_ %+ %1 %+ _u64
986 %endif ; RT_ARCH_AMD64
987
988 %if %2 != 0 ; locked versions requested?
989
990BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
991 PROLOGUE_3_ARGS
992 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
993 lock %1 byte [A1], A2_8
994 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
995 EPILOGUE_3_ARGS
996ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
997
998BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
999 PROLOGUE_3_ARGS
1000 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
1001 lock %1 word [A1], A2_16
1002 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
1003 EPILOGUE_3_ARGS
1004ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1005
1006BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
1007 PROLOGUE_3_ARGS
1008 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
1009 lock %1 dword [A1], A2_32
1010 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
1011 EPILOGUE_3_ARGS
1012ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1013
1014 %ifdef RT_ARCH_AMD64
1015BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
1016 PROLOGUE_3_ARGS
1017 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
1018 lock %1 qword [A1], A2
1019 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
1020 EPILOGUE_3_ARGS_EX 8
1021ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1022 %endif ; RT_ARCH_AMD64
1023 %endif ; locked
1024%endmacro
1025
1026; instr,lock, modified-flags, undefined flags, must be loaded, zeroed flags
; Note: adc/sbb must load CF (it is an input); the logical ops leave AF
; undefined and always clear OF and CF.
1027IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
1028IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0
1029IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
1030IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0
1031IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
1032IEMIMPL_BIN_OP or, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1033IEMIMPL_BIN_OP xor, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1034IEMIMPL_BIN_OP and, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1035IEMIMPL_BIN_OP test, 0, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1036
1037
1038;;
1039; Macro for implementing a binary operator, VEX variant with separate input/output.
1040;
1041; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1042; where the 64-bit accesses requires hand coding.
1043;
1044; All the functions takes a pointer to the destination memory operand in A0,
1045; the first source register operand in A1, the second source register operand
1046; in A2 and a pointer to eflags in A3.
1047;
1048; @param 1 The instruction mnemonic.
1049; @param 2 The modified flags.
1050; @param 3 The undefined flags.
1051; @param 4 The zeroed flags.
1052;
%macro IEMIMPL_VEX_BIN_OP 4
;
; 32-bit worker: A0 = pDst, A1/A2 = the two source register operands,
; A3 = pointer to the EFLAGS (see the header comment above the macro).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD       A3, %2, %3, 0 ;; @todo do we need to load undefined flags for any platform?
        %1      T0_32, A1_32, A2_32    ; T0 = op(src1, src2) - 3-operand VEX form
        mov     [A0], T0_32
        IEM_SAVE_FLAGS_OLD             A3, %2, %3, %4
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
;
; 64-bit worker; only emitted on AMD64 hosts (hand coded in C elsewhere for 32-bit).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD       A3, %2, %3, 0
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS_OLD             A3, %2, %3, %4
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
1074
; BMI1/BMI2 three-operand instructions that update EFLAGS.
; instr, modified-flags, undefined-flags, zeroed-flags
IEMIMPL_VEX_BIN_OP andn, X86_EFL_SF | X86_EFL_ZF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
IEMIMPL_VEX_BIN_OP bextr, X86_EFL_ZF, X86_EFL_SF | X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
IEMIMPL_VEX_BIN_OP bzhi, X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF
1079
1080;;
1081; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
1082;
1083; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1084; where the 64-bit accesses requires hand coding.
1085;
1086; All the functions takes a pointer to the destination memory operand in A1,
1087; the source register operand in A2 and incoming EFLAGS in A0. Updated EFLAGS
1088; are returned in EAX.
1089;
1090; @param 1 The instruction mnemonic.
1091; @param 2 The modified flags.
1092; @param 3 The undefined flags.
1093; @param 4 The zeroed flags.
1094;
%macro IEMIMPL_VEX_BIN_OP_2 4
;
; In-place read-modify-write on [A1]; incoming EFLAGS in A0_32, updated
; EFLAGS returned in EAX (see the header comment above the macro).
; NOTE(review): PROLOGUE_4_ARGS is used although the proc is declared with a
; 12-byte (3-argument) stack frame - presumably a harmless extra register
; save; confirm against the PROLOGUE_*_ARGS definitions.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %2, %3, 0 ;; @todo check if any undefined flags are passed thru
        mov     T0_32, [A1]            ; load destination
        %1      T0_32, A2_32           ; T0 = op(T0, src)
        mov     [A1], T0_32            ; write back
        IEM_SAVE_FLAGS_RETVAL          A0_32, %2, %3, %4
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %2, %3, 0
        mov     T0, [A1]
        %1      T0, A2
        mov     [A1], T0
        IEM_SAVE_FLAGS_RETVAL          A0_32, %2, %3, %4
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
1118
; BMI1 bit-manipulation instructions (lowest-set-bit family).
; instr, modified-flags, undefined-flags zeroed-flags
IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1123
1124
1125;;
1126; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
1127;
1128; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1129; where the 64-bit accesses requires hand coding.
1130;
1131; All the functions takes a pointer to the destination memory operand in A0,
1132; the first source register operand in A1, the second source register operand
1133; in A2 and a pointer to eflags in A3.
1134;
1135; @param 1 The instruction mnemonic.
1136; @param 2 Fallback instruction if applicable.
1137; @param 3 Whether to emit fallback or not.
1138;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
;
; A0 = pDst, A1 = first source operand, A2 = second source operand (the
; shift count for the sarx/shlx/shrx fallbacks).  No EFLAGS read or written.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
; Fallback for hosts without BMI2: emulate via the legacy 2-operand shift,
; which needs the count in CL.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8               ; count -> CL
        %2      A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                 ; MSC: A0=RCX so the swap puts the count in CL and pDst in A2
        %2      A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8               ; count -> CL
        %2      A1, cl
        mov     [A0], A1               ; fix: store all 64 bits (was a 32-bit store of A1_32)
 %else
        xchg    A2, A0                 ; MSC: count -> CL, pDst -> A2 (see u32 fallback)
        %2      A1, cl
        mov     [A2], A1               ; fix: store all 64 bits (was a 32-bit store of A1_32)
 %endif
        ; Note: removed a stray 'mov [A0], A1' that followed the %endif; on the
        ; MSC path A0 held the shift count after the xchg above, so that store
        ; dereferenced a non-pointer value.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
 %endif
 %endif ; RT_ARCH_AMD64
%endmacro
1189
; BMI2 flag-less shifts and bit deposit/extract (pdep/pext have no legacy
; single-instruction fallback, hence 'nop' and no fallback emission).
; instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
1196
1197
1198;
1199; RORX uses a immediate byte for the shift count, so we only do
1200; fallback implementation of that one.
1201;
;
; RORX fallback: rotate A1 right by A2 and store to [A0].  Emulated with the
; legacy 'ror' which needs the count in CL; no EFLAGS are returned.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8               ; count -> CL
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                 ; MSC: A0=RCX, so this puts the count in CL and pDst in A2
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32
1215
 %ifdef RT_ARCH_AMD64
; 64-bit RORX fallback; AMD64 hosts only.
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8               ; count -> CL
        ror     A1, cl
        mov     [A0], A1
 %else
        xchg    A2, A0                 ; MSC: count -> CL, pDst -> A2
        ror     A1, cl
        mov     [A2], A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
1231
1232
1233;
1234; MULX
1235;
;
; MULX (native): unsigned A3 * EDX, high half -> [A0], low half -> [A1].
; The implicit multiplicand is EDX, so the argument registers are shuffled
; to get uSrc1 into EDX first.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32            ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32            ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32
1252
1253
;
; MULX fallback for hosts without BMI2: use the legacy 'mul' (EDX:EAX result).
; Note that 'mul' clobbers EFLAGS, but MULX has no architectural flag outputs
; so nothing needs preserving for the guest.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32                  ; EDX:EAX = EAX * uSrc1
        mov     [A1], eax              ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax              ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
1272
%ifdef RT_ARCH_AMD64
;
; 64-bit MULX (native): unsigned A3 * RDX, high half -> [A0], low half -> [A1].
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


;
; 64-bit MULX fallback via legacy 'mul' (RDX:RAX result).
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2                     ; RDX:RAX = RAX * uSrc1
        mov     [A1], rax              ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax              ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
1312
1313
1314;;
1315; Macro for implementing a bit operator.
1316;
1317; This will generate code for the 16, 32 and 64 bit accesses with locked
1318; variants, except on 32-bit system where the 64-bit accesses requires hand
1319; coding.
1320;
1321; All the functions takes a pointer to the destination memory operand in A1,
1322; the source register operand in A2 and incoming eflags in A0.
1323;
1324; @param 1 The instruction mnemonic.
1325; @param 2 Non-zero if there should be a locked version.
1326; @param 3 The modified flags.
1327; @param 4 The undefined flags.
1328;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
;
; 16-bit worker: A0_32 = incoming EFLAGS, [A1] = destination, A2 = bit index.
; Updated EFLAGS are returned in EAX (see the header comment above the macro).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %3, %4, 0
        %1      word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL          A0_32, %3, %4, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %3, %4, 0
        %1      dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL          A0_32, %3, %4, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %3, %4, 0
        %1      qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL          A0_32, %3, %4, 0
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %3, %4, 0
        lock %1 word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL          A0_32, %3, %4, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %3, %4, 0
        lock %1 dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL          A0_32, %3, %4, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %3, %4, 0
        lock %1 qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL          A0_32, %3, %4, 0
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
1386
; Bit test instructions; bt has no write-back, hence no locked variant.
; Undefined flags are passed thru here by the intel and amd CPUs we have.
; modified efl, undefined eflags
IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1393
1394;;
1395; Macro for implementing a bit search operator.
1396;
1397; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1398; system where the 64-bit accesses requires hand coding.
1399;
1400; All the functions takes a pointer to the destination memory operand in A1,
1401; the source register operand in A2 and the incoming eflags in A0.
1402;
1403; In the ZF case the destination register is 'undefined', however it seems that
1404; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
1405; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
1406; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
1407; the behaviour of more recent CPUs (Intel 10980XE and AMD 3990X).
1408;
1409; Intel: Clear all and calculate PF in addition to ZF.
1410; AMD: Passthru all flags other than ZF.
1411;
1412; @param 1 The instruction mnemonic.
1413; @param 2 The modified flags.
1414; @param 3 The undefined flags.
1415; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
1416;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
; 16-bit

;
; A0_32 = incoming EFLAGS, [A1] = destination, A2 = source; result flags in EAX.
; When %4 != 0 the destination write is skipped on ZF=1 (bsf/bsr leave the
; destination as-is when the source is zero - see the macro header comment).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
        %1      T0_16, A2_16
%if %4 != 0
        jz      .unchanged_dst         ; ZF is set by %1 when the source was zero
%endif
        mov     [A1], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS_RETVAL          A0_32, %2, %3, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; Retired per-vendor variants kept for reference (';bad;' disables them).
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
;bad; PROLOGUE_3_ARGS
;bad; %1 T1_16, A1_16
;bad; jz .unchanged_dst
;bad; mov [A0], T1_16
;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
;bad; EPILOGUE_3_ARGS
;bad;.unchanged_dst:
;bad;%if %4 != 0
;bad; mov [A0], T1_16
;bad;%endif
;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
;bad; EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
;bad;
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
;bad; PROLOGUE_3_ARGS
;bad; %1 T0_16, A1_16
;bad;%if %4 != 0
;bad; jz .unchanged_dst
;bad;%endif
;bad; mov [A0], T0_16
;bad;.unchanged_dst:
;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
;bad; EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_amd

; 32-bit

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
        %1      T0_32, A2_32
%if %4 != 0
        jz      .unchanged_dst         ; ZF is set by %1 when the source was zero
%endif
        mov     [A1], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS_RETVAL          A0_32, %2, %3, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
;bad; PROLOGUE_3_ARGS
;bad; %1 T1_32, A1_32
;bad;%if %4 != 0
;bad; jz .unchanged_dst
;bad;%endif
;bad; mov [A0], T1_32
;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
;bad; EPILOGUE_3_ARGS
;bad;.unchanged_dst:
;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
;bad; EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
;bad;
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
;bad; PROLOGUE_3_ARGS
;bad; %1 T0_32, A1_32
;bad;%if %4 != 0
;bad; jz .unchanged_dst
;bad;%endif
;bad; mov [A0], T0_32
;bad;.unchanged_dst:
;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
;bad; EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64
; 64-bit

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
        %1      T0, A2
%if %4 != 0
        jz      .unchanged_dst         ; ZF is set by %1 when the source was zero
%endif
        mov     [A1], T0
.unchanged_dst:
        IEM_SAVE_FLAGS_RETVAL          A0_32, %2, %3, 0
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
;bad; PROLOGUE_3_ARGS
;bad; %1 T1, A1
;bad;%if %4 != 0
;bad; jz .unchanged_dst
;bad;%endif
;bad; mov [A0], T1
;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
;bad; EPILOGUE_3_ARGS
;bad;.unchanged_dst:
;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
;bad; EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
;bad;
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
;bad; PROLOGUE_3_ARGS
;bad; %1 T0, A1
;bad;%if %4 != 0
;bad; jz .unchanged_dst
;bad;%endif
;bad; mov [A0], T0
;bad;.unchanged_dst:
;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
;bad; EPILOGUE_3_ARGS_EX 8
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro
1547
; instr, modified flags, undefined flags, skip-dst-write-on-ZF (bsf/bsr only)
IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1552
1553
1554;;
1555; Macro for implementing POPCNT.
1556;
1557; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1558; system where the 64-bit accesses requires hand coding.
1559;
1560; All the functions takes a pointer to the destination memory operand in A1,
1561; the source register operand in A2 and eflags in A0.
1562;
1563; ASSUMES Intel and AMD set EFLAGS the same way.
1564;
1565; ASSUMES the instruction does not support memory destination.
1566;
1567; @param 1 The instruction mnemonic.
1568; @param 2 The modified flags.
1569; @param 3 The undefined flags.
1570; @param 4 The zeroed flags.
1571;
%macro IEMIMPL_BIT_OP3 4
BEGINCODE
;
; A0_32 = incoming EFLAGS, [A1] = destination, A2 = source; result flags in EAX.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %2, %3, 0
        %1      T0_16, A2_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS_RETVAL          A0_32, %2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %2, %3, 0
        %1      T0_32, A2_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS_RETVAL          A0_32, %2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %2, %3, 0
        %1      T0, A2
        mov     [A1], T0
        IEM_SAVE_FLAGS_RETVAL          A0_32, %2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
; POPCNT: ZF from the result, everything else zeroed.
IEMIMPL_BIT_OP3 popcnt, X86_EFL_ZF, 0, X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF
1604
1605
1606;
1607; IMUL is also a similar but yet different case (no lock, no mem dst).
1608; The rDX:rAX variant of imul is handled together with mul further down.
1609;
1610BEGINCODE
1611; @param 1 EFLAGS that are modified.
1612; @param 2 Undefined EFLAGS.
1613; @param 3 Function suffix.
1614; @param 4 EFLAGS variation: 0 for native, 1 for intel,
1615; 2 for AMD (set AF, clear PF, ZF and SF).
%macro IEMIMPL_IMUL_TWO 4
;
; Two-operand IMUL: [A1] *= A2, incoming EFLAGS in A0_32, result flags in EAX.
; %4 selects the flag behaviour (0 = native/AMD passthru, 1 = Intel recalc).
;
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
        imul    A2_16, word [A1]
        mov     [A1], A2_16
 %if %4 != 1
        IEM_SAVE_FLAGS_RETVAL          A0_32, %1, %2, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_16, 16, A2 ; intel
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
        imul    A2_32, dword [A1]
        mov     [A1], A2_32
 %if %4 != 1
        IEM_SAVE_FLAGS_RETVAL          A0_32, %1, %2, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_32, 32, A2 ; intel
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS           A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
        imul    A2, qword [A1]
        mov     [A1], A2
 %if %4 != 1
        IEM_SAVE_FLAGS_RETVAL          A0_32, %1, %2, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2, 64, A2 ; intel
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
; The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
; flags as is. Whereas Intel skylake (6700K and 10980XE (Cascade Lake)) always
; clear AF and ZF and calculates SF and PF as per the lower half of the result.
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1664
1665
1666;
1667; XCHG for memory operands. This implies locking. No flag changes.
1668;
1669; Each function takes two arguments, first the pointer to the memory,
1670; then the pointer to the register. They all return void.
1671;
BEGINCODE
;
; Locked XCHG: swap [A0] (memory) with [A1] (register shadow).  The 'xchg'
; instruction with a memory operand is implicitly locked, so no lock prefix
; is needed.  No flags are touched.
;
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]
        xchg    [A0], T0_8             ; implicitly locked
        mov     [A1], T0_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16            ; implicitly locked
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32            ; implicitly locked
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0               ; implicitly locked
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1706
1707; Unlocked variants for fDisregardLock mode.
1708
;
; Unlocked XCHG for fDisregardLock mode: two plain loads and two plain stores,
; deliberately non-atomic.  No flags are touched.
;
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]
        mov     T1_8, [A0]
        mov     [A0], T0_8
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        mov     T1, [A0]
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1746
1747
1748;
1749; XADD for memory operands.
1750;
1751; Each function takes three arguments, first the pointer to the
1752; memory/register, then the pointer to the register, and finally a pointer to
1753; eflags. They all return void.
1754;
BEGINCODE
;
; XADD: [A0] and the register shadow at [A1] are exchanged, then the sum is
; stored to [A0].  A2 points to the EFLAGS.  Locked variants follow below.
;
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]
        xadd    [A0], T0_8
        mov     [A1], T0_8             ; return the old memory value thru [A1]
        IEM_SAVE_FLAGS_OLD             A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS_OLD             A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS_OLD             A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS_OLD             A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64

; Locked variants - identical to the above but with a lock prefix on the xadd.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS_OLD             A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS_OLD             A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS_OLD             A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS_OLD             A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1839
1840
1841;
1842; CMPXCHG8B.
1843;
1844; These are tricky register wise, so the code is duplicated for each calling
1845; convention.
1846;
1847; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1848;
1849; C-proto:
1850; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1851; uint32_t *pEFlags));
1852;
1853; Note! Identical to iemAImpl_cmpxchg16b.
1854;
BEGINCODE
;
; See the header comment above for the C prototype.  RBX/EBX is callee-saved
; in all three conventions and is an implicit cmpxchg8b input, hence the
; explicit push/pop instead of the PROLOGUE/EPILOGUE macros.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                    ; callee-saved; clobbered by the EBX input below

        mov     r11, rdx               ; pu64EaxEdx (is also T1)
        mov     r10, rcx               ; pu64Dst

        mov     ebx, [r8]              ; ECX:EBX = replacement value
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]             ; EDX:EAX = expected value
        mov     edx, [r11 + 4]

        cmpxchg8b [r10]

        mov     [r11], eax             ; write back EDX:EAX (holds the old value on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx               ; pEFlags
        mov     r11, rdx               ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]             ; ECX:EBX = replacement value
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]             ; EDX:EAX = expected value
        mov     edx, [rsi + 4]

        cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx               ; pu64Dst
        mov     esi, edx               ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (stack arg; 16 = pushes, 4 = return addr)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                      ; pop the two stack arguments (fastcall)
%endif
ENDPROC iemAImpl_cmpxchg8b
1930
;
; Locked variant of iemAImpl_cmpxchg8b above; identical except for the lock
; prefix on the cmpxchg8b itself.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                    ; callee-saved; clobbered by the EBX input below

        mov     r11, rdx               ; pu64EaxEdx (is also T1)
        mov     r10, rcx               ; pu64Dst

        mov     ebx, [r8]              ; ECX:EBX = replacement value
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]             ; EDX:EAX = expected value
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax             ; write back EDX:EAX (holds the old value on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx               ; pEFlags
        mov     r11, rdx               ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]             ; ECX:EBX = replacement value
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]             ; EDX:EAX = expected value
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx               ; pu64Dst
        mov     esi, edx               ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (stack arg; 16 = pushes, 4 = return addr)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                      ; pop the two stack arguments (fastcall)
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
2005
2006%ifdef RT_ARCH_AMD64
2007
2008;
2009; CMPXCHG16B.
2010;
2011; These are tricky register wise, so the code is duplicated for each calling
2012; convention.
2013;
2014; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
2015;
2016; C-proto:
2017; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
2018; uint32_t *pEFlags));
2019;
2020; Note! Identical to iemAImpl_cmpxchg8b.
2021;
BEGINCODE
;
; 128-bit compare-and-exchange; AMD64 hosts only (see the header comment and
; C prototype above).  Structure mirrors iemAImpl_cmpxchg8b with 64-bit regs.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                    ; callee-saved; clobbered by the RBX input below

        mov     r11, rdx               ; pu64RaxRdx (is also T1)
        mov     r10, rcx               ; pu64Dst

        mov     rbx, [r8]              ; RCX:RBX = replacement value
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]             ; RDX:RAX = expected value
        mov     rdx, [r11 + 8]

        cmpxchg16b [r10]

        mov     [r11], rax             ; write back RDX:RAX (holds the old value on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx               ; pEFlags
        mov     r11, rdx               ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]             ; RCX:RBX = replacement value
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]             ; RDX:RAX = expected value
        mov     rdx, [rsi + 8]

        cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
2067
;;
; LOCK-prefixed variant of iemAImpl_cmpxchg16b above; identical except for the
; LOCK prefix on the cmpxchg16b instruction itself.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        ; MSC: A0=rcx=pu128Dst, A1=rdx=pu128RaxRdx, A2=r8=pu128RbxRcx, A3=r9=pEFlags.
        push    rbx                     ; rbx is callee-saved but needed for the replacement value.

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; rcx:rbx = replacement value (*pu128RbxRcx).
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = expected value (*pu128RaxRdx).
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        ; Write back rdx:rax (current destination value on mismatch).
        mov     [r11], rax
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; SysV: A0=rdi=pu128Dst, A1=rsi=pu128RaxRdx, A2=rdx=pu128RbxRcx, A3=rcx=pEFlags.
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = replacement value (*pu128RbxRcx).
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = expected value (*pu128RaxRdx).
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        ; Write back rdx:rax (current destination value on mismatch).
        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked
2112
2113%endif ; RT_ARCH_AMD64
2114
2115
2116;
2117; CMPXCHG.
2118;
2119; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
2120;
2121; C-proto:
2122; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
2123;
2124BEGINCODE
%macro IEMIMPL_CMPXCHG 2
; @param 1    Lock prefix to put in front of the CMPXCHG instruction ('lock' or blank).
; @param 2    Function name suffix ('_locked' or blank).
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; al = expected value (*puEax).
        %1 cmpxchg [A0], A2_8
        mov     [A1], al                ; Write back accumulator (current dest value on mismatch).
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; ax = expected value (*puAx).
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax                ; Write back accumulator (current dest value on mismatch).
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; eax = expected value (*puEax).
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax               ; Write back accumulator (current dest value on mismatch).
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; rax = expected value (*puRax).
        %1 cmpxchg [A0], A2
        mov     [A1], rax               ; Write back accumulator (current dest value on mismatch).
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = replacement value (*pu64Reg).
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = expected value (*pu64Rax).
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; NOTE(review): jz (ZF=1, i.e. values were equal) branches to the label
        ; named .cmpxchg8b_not_equal - the label name vs. the condition looks
        ; inverted; verify against the @todo below before relying on the flags.
        jz      .cmpxchg8b_not_equal
;; @todo this isn't correct. Need to do a 64-bit compare, not just the lower 32-bit.
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ; 64-bit compare done as: compare high dwords; only if equal, compare low dwords.
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG
2211
; Instantiate the plain and LOCK-prefixed variants of the CMPXCHG workers.
IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
2214
2215
2216
2217;;
2218; Macro for implementing a unary operator.
2219;
2220; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
2221; variants, except on 32-bit system where the 64-bit accesses requires hand
2222; coding.
2223;
2224; All the functions takes a pointer to the destination memory operand in A0,
2225; the source register operand in A1 and a pointer to eflags in A2.
2226;
2227; @param 1 The instruction mnemonic.
2228; @param 2 The modified flags.
2229; @param 3 The undefined flags.
2230;
%macro IEMIMPL_UNARY_OP 3
; Generates iemAImpl_<op>_u{8,16,32,64}[_locked]; each applies %1 directly to
; the memory operand at A0 with the guest flags loaded/saved around it via A1.
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      byte [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 byte [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      word [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 word [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      dword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 dword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      qword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 qword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro
2300
; Note: inc/dec leave CF untouched, hence CF is absent from their modified-flags
;       masks; not modifies no flags at all (both masks zero).
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
2305
2306
2307;
2308; BSWAP. No flag changes.
2309;
2310; Each function takes one argument, pointer to the value to bswap
2311; (input/output). They all return void.
2312;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        ; Hand-emit a 66h operand-size prefix so the bswap below executes with a
        ; 16-bit operand (assemblers won't encode 'bswap r16'); presumably done
        ; to reproduce real CPU behavior for this architecturally undefined form.
        db 66h
        bswap T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
2321
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; Load, byte-swap in a register, store back.
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
2329
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: byte-swap each dword and exchange the two halves.
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; T0 = low dword.
        mov     T1, [A0 + 4]            ; T1 = high dword.
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; Swapped low dword becomes the new high dword.
        mov     [A0], T1                ; Swapped high dword becomes the new low dword.
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
2348
2349
2350;;
2351; Macro for implementing a shift operation.
2352;
2353; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2354; 32-bit system where the 64-bit accesses requires hand coding.
2355;
2356; All the functions takes a pointer to the destination memory operand in A0,
2357; the shift count in A1 and a pointer to eflags in A2.
2358;
2359; @param 1 The instruction mnemonic.
2360; @param 2 The modified flags.
2361; @param 3 The undefined flags.
2362; @param 4 Force load flags.
2363;
2364; Makes ASSUMPTIONS about A0, A1 and A2 assignments. Specifically, that with
2365; GCC/64 we're free to use RCX/CL as it isn't used for any arguments. While
2366; MSC/64 & 32-bit fastcall are using ECX for the first argument (fEFlagsIn),
2367; so we have to switch it around with the shift count parameter registers.
2368;
2369; @note the _intel and _amd variants are implemented in C.
2370;
%macro IEMIMPL_SHIFT_OP 4
; On MSC/64 and 32-bit the incoming eflags arrive in ecx which the shift needs
; for the count, so A0 (flags) and A2 (count) are exchanged before use.
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      byte [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0                  ; Free up ecx/cl for the shift count.
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      byte [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
.zero_shift:                            ; NOTE(review): no jump to this label is visible in this unit.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      word [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0                  ; Free up ecx/cl for the shift count.
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      word [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      dword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0                  ; Free up ecx/cl for the shift count.
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      dword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      qword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0                  ; Free up ecx/cl for the shift count.
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      qword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro
2441
; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
; we have to force load all modified and undefined.
; (Parameter 4 below is that force-load mask.)
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2452
2453
2454;;
2455; Macro for implementing a double precision shift operation.
2456;
2457; This will generate code for the 16, 32 and 64 bit accesses, except on
2458; 32-bit system where the 64-bit accesses requires hand coding.
2459;
2460; The functions takes the destination operand (r/m) in A0, the source (reg) in
2461; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
2462;
2463; @param 1 The instruction mnemonic.
2464; @param 2 The modified flags.
2465; @param 3 The undefined flags.
2466; @param 4 The force loaded flags.
2467;
2468; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
2469;
2470; @note the _intel and _amd variants are implemented in C.
2471;
%macro IEMIMPL_SHIFT_DBL_OP 4
; shld/shrd need the count in cl; on GCC/64 cl aliases A3 (pEFlags) and on the
; other conventions it aliases A0, hence the xchg dances around the instruction.
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS_OLD A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; Get the count into cl, keeping A3 restorable.
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; Restore A3 (pEFlags) for the save below.
 %else
        xchg    A0, A2                  ; Get the count into cl; dest pointer now in A2.
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS_OLD A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; Get the count into cl, keeping A3 restorable.
        %1      [A0], A1_32, cl
        xchg    A3, A2                  ; Restore A3 (pEFlags) for the save below.
 %else
        xchg    A0, A2                  ; Get the count into cl; dest pointer now in A2.
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS_OLD A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; Get the count into cl, keeping A3 restorable.
        %1      [A0], A1, cl
        xchg    A3, A2                  ; Restore A3 (pEFlags) for the save below.
 %else
        xchg    A0, A2                  ; Get the count into cl; dest pointer now in A2.
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro
2525
; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
; we have to force load all modified and undefined.
; (Parameter 4 below is that force-load mask.)
IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2531
2532
2533;;
2534; Macro for implementing a multiplication operations.
2535;
2536; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2537; 32-bit system where the 64-bit accesses requires hand coding.
2538;
2539; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2540; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2541; incoming eflags in A3.
2542;
2543; The functions all return eflags. Since valid eflags can't ever be zero, we can
2544; use the same macros/tests framework as div/idiv.
2545;
2546; @param 1 The instruction mnemonic.
2547; @param 2 The modified flags.
2548; @param 3 The undefined flags.
2549; @param 4 Name suffix.
2550; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2551;
2552; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2553;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     al, [A0]                ; al = multiplicand (*puAx low byte).
        %1      A1_8
        mov     [A0], ax                ; 8-bit mul/imul produces the 16-bit result in ax.
 %if %5 != 1
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %else
        movzx   edx, ax                 ; Intel: derive SF/PF from the result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A2_32, %2, X86_EFL_AF | X86_EFL_ZF, dx, 8, xDX ; intel
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     ax, [A0]                ; ax = multiplicand (*pu16RAX).
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax                ; Low result back to *pu16RAX.
        mov     [A1], dx                ; High result to *pu16RDX.
 %else
        mov     T1, A1                  ; Save pu16RDX; A1 aliases dx on this convention.
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_RETVAL A3_32, %2, %3, 0
 %else
        movzx   edx, ax                 ; Intel: derive SF/PF from the low result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A3_32, %2, X86_EFL_AF | X86_EFL_ZF, dx, 16, xDX ; intel
 %endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     eax, [A0]               ; eax = multiplicand (*pu32RAX).
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax               ; Low result back to *pu32RAX.
        mov     [A1], edx               ; High result to *pu32RDX.
 %else
        mov     T1, A1                  ; Save pu32RDX; A1 aliases edx on this convention.
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_RETVAL A3_32, %2, %3, 0
 %else
        mov     edx, eax                ; Intel: derive SF/PF from the low result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A3_32, %2, X86_EFL_AF | X86_EFL_ZF, edx, 32, xDX ; intel
 %endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     rax, [A0]               ; rax = multiplicand (*pu64RAX).
 %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax               ; Low result back to *pu64RAX.
        mov     [A1], rdx               ; High result to *pu64RDX.
 %else
        mov     T1, A1                  ; Save pu64RDX; A1 aliases rdx on this convention.
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_RETVAL A3_32, %2, %3, 0
 %else
        mov     T2, rax                 ; Intel: derive SF/PF from the low result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A3_32, %2, X86_EFL_AF | X86_EFL_ZF, T2, 64, T2 ; intel
 %endif
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; !RT_ARCH_AMD64
%endmacro
2642
; Native (suffix-less), Intel and AMD EFLAGS-behavior variants of mul/imul.
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2649
2650
2651BEGINCODE
2652;;
2653; Worker function for negating a 32-bit number in T1:T0
2654; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        ; Computes T1:T0 = 0 - T1:T0 using two stack slots:
        ; after the xchgs, T1:T0 = 0 and the slots hold the original value,
        ; which sub/sbb then subtract from zero with correct borrow propagation.
        push    0
        push    0
        xchg    T0_32, [xSP]
        xchg    T1_32, [xSP + xCB]
        sub     T0_32, [xSP]
        sbb     T1_32, [xSP + xCB]
        add     xSP, xCB*2              ; Drop the two temporaries.
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2665
2666%ifdef RT_ARCH_AMD64
2667;;
2668; Worker function for negating a 64-bit number in T1:T0
2669; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        ; Same trick as iemAImpl_negate_T0_T1_u32, but for a 128-bit value in
        ; T1:T0: zero the registers via xchg with stack slots, then sub/sbb the
        ; saved original from zero.
        push    0
        push    0
        xchg    T0, [xSP]
        xchg    T1, [xSP + xCB]
        sub     T0, [xSP]
        sbb     T1, [xSP + xCB]
        add     xSP, xCB*2              ; Drop the two temporaries.
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
2680%endif
2681
2682
2683;;
2684; Macro for implementing a division operations.
2685;
2686; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2687; 32-bit system where the 64-bit accesses requires hand coding.
2688;
2689; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2690; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and
2691; incoming eflags in A3.
2692;
2693; The functions returns the new EFLAGS on success and zero on divide error.
2694; The new EFLAGS value can never be zero, given that bit 1 always set.
2695;
2696; @param 1 The instruction mnemonic.
2697; @param 2 The modified flags.
2698; @param 3 The undefined flags.
2699; @param 4 1 if signed, 0 if unsigned.
2700; @param 5 Function suffix.
2701; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2702; 2 for AMD (set AF, clear PF, ZF and SF).
2703;
2704; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2705;
%macro IEMIMPL_DIV_OP 6
; Each width first rejects divide-by-zero, then pre-checks for quotient
; overflow (so the host div/idiv cannot fault), performs the division, and
; finally saves/adjusts EFLAGS per the requested CPU behavior (%6).
; Returns the new EFLAGS in eax, or zero on #DE conditions.
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        and     A1_32, 0xff             ; Ensure it's zero extended to 16-bits for the idiv range check.
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; Overflows iff high byte of dividend >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A0]        ; T0 = dividend (zero extending to full register to simplify register aliasing)
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %3 ; Undefined flags may be passed thru (Intel)
        mov     ax, [A0]                ; ax = 16-bit dividend.
        %1      A1_8
        mov     [A0], ax                ; al = quotient, ah = remainder.
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_RETVAL A2_32, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        xor     eax, eax                ; Zero return value = #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        and     A2_16, 0xffff           ; Zero extend it for simpler sign overflow checks (see below).
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; Overflows iff high word of dividend >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A1]        ; Zero extend to simplify register aliasing by clobbing the whole register.
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Divisor out of dx's way.
        mov     ax, [A0]                ; dx:ax = dividend.
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax                ; Quotient.
        mov     [A1], dx                ; Remainder.
 %else
        mov     T1, A1                  ; pu16RDX out of dx's way.
        mov     ax, [A0]                ; dx:ax = dividend.
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax                ; Quotient.
        mov     [T1], dx                ; Remainder.
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_RETVAL A3_32, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_RETVAL A3_32, %2, %3, 0
 %endif

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        xor     eax, eax                ; Zero return value = #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; Overflows iff high dword of dividend >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        ;test    A2_32, A2_32 - we did this 5 instructions ago.
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; Restore the (possibly negated-back) divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     eax, [A0]               ; NOTE(review): redundant load - both branches below reload eax.
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Divisor out of edx's way.
        mov     eax, [A0]               ; edx:eax = dividend.
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax               ; Quotient.
        mov     [A1], edx               ; Remainder.
 %else
        mov     T1, A1                  ; pu32RDX out of edx's way.
        mov     eax, [A0]               ; edx:eax = dividend.
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax               ; Quotient.
        mov     [T1], edx               ; Remainder.
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_RETVAL A3_32, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_RETVAL A3_32, %2, %3, 0
 %endif

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Undo the divisor save from the signed path.
 %endif
.div_zero:
        xor     eax, eax                ; Zero return value = #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2
        jz      .div_zero
 %if %4 == 0
        cmp     [A1], A2                ; Overflows iff high qword of dividend >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        ;test    A2, A2 - we did this five instructions above.
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; Restore the (possibly negated-back) divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     rax, [A0]               ; NOTE(review): redundant load - both branches below reload rax.
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Divisor out of rdx's way.
        mov     rax, [A0]               ; rdx:rax = dividend.
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax               ; Quotient.
        mov     [A1], rdx               ; Remainder.
 %else
        mov     T1, A1                  ; pu64RDX out of rdx's way.
        mov     rax, [A0]               ; rdx:rax = dividend.
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax               ; Quotient.
        mov     [T1], rdx               ; Remainder.
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_RETVAL A3_32, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_RETVAL A3_32, %2, %3, 0
 %endif

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Undo the divisor save from the signed path.
 %endif
.div_zero:
        xor     eax, eax                ; Zero return value = #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; !RT_ARCH_AMD64

%endmacro
3026
; Native (suffix-less), Intel and AMD EFLAGS-behavior variants of div/idiv.
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
3034
3035
3036;;
3037; Macro for implementing memory fence operation.
3038;
3039; No return value, no operands or anything.
3040;
3041; @param 1 The instruction.
3042;
%macro IEMIMPL_MEM_FENCE 1
; @param 1    The fence instruction, which is both emitted and used as the
;             function name suffix (iemAImpl_<insn>).
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro
3050
; One tiny worker per fence instruction.
IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
3054
3055;;
3056; Alternative for non-SSE2 host.
3057;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        ; xchg with a memory operand has an implicit LOCK, which serves as a
        ; full memory barrier on CPUs without SSE2 fence instructions.
        push    xAX
        xchg    xAX, [xSP]              ; Swaps xAX with its own saved copy, so xAX is preserved.
        add     xSP, xCB                ; Drop the stack slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
3064
3065
3066;;
3067; Initialize the FPU for the actual instruction being emulated, this means
3068; loading parts of the guest's control word and status word.
3069;
3070; @uses 24 bytes of stack. T0, T1
3071; @param 1 Expression giving the address of the FXSTATE of the guest.
3072;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        ; Capture the host FPU environment so we can patch FCW/FSW and reload it.
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        ; Note: use the 32-bit T0_32/T1_32 forms throughout for consistency with
        ; FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 below; movzx zero-extends,
        ; so the result is identical while avoiding needless REX.W prefixes.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        ; Merge the guest's condition-code bits with the host's TOP bits.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
3091
3092
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word, status word, and update the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        ; Keep the host TOP bits, merge in the guest condition code bits.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        ; Derive the guest TOP from FSW, then probe the abridged (fxsave
        ; format) tag word: a clear bit means the register is empty.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3 (empty)
%%st0_not_empty:

        fldenv  [xSP]
%endmacro
3130
3131
;;
; FPU result packet: the 80-bit result value followed by the output FSW.
; Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result resw 5                   ; 80-bit (tword) result value.
    .FSW resw 1                         ; Output FPU status word.
endstruc


;;
; Two-value FPU result packet (fptan, fxtract, fsincos); note the FSW field
; sits between the two values.
; Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; First 80-bit result.
    .FSW resw 1                         ; Output FPU status word.
    .r80Result2 resw 5                  ; Second 80-bit result.
endstruc
3149
3150
;
;---------------------- 16-bit signed integer operations ----------------------
;


;;
; Converts a 16-bit signed integer value to a 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv in the macro below

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before touching the FPU again
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
3179
3180
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; load the input BEFORE applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
3204
3205
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation (fisttp ignores the FCW rounding mode).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; load the input BEFORE applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
3230
3231
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value (ST0).
; @param A3 Pointer to the 16-bit value (memory operand).
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]              ; ST0 loaded before applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
3268
3269
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]              ; ST0 loaded before applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
3300
3301
3302
3303;
3304;---------------------- 32-bit signed integer operations ----------------------
3305;
3306
3307
3308;;
3309; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3310;
3311; @param A0 FPU context (fxsave).
3312; @param A1 Pointer to a IEMFPURESULT for the output.
3313; @param A2 Pointer to the 32-bit floating point value to convert.
3314;
3315BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
3316 PROLOGUE_3_ARGS
3317 sub xSP, 20h
3318
3319 fninit
3320 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3321 fild dword [A2]
3322
3323 fnstsw word [A1 + IEMFPURESULT.FSW]
3324 fnclex
3325 fstp tword [A1 + IEMFPURESULT.r80Result]
3326
3327 fninit
3328 add xSP, 20h
3329 EPILOGUE_3_ARGS
3330ENDPROC iemAImpl_fild_r80_from_i32
3331
3332
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; load the input BEFORE applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
3356
3357
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation (fisttp ignores the FCW rounding mode).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; load the input BEFORE applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
3382
3383
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value (ST0).
; @param A3 Pointer to the 32-bit value (memory operand).
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]              ; ST0 loaded before applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
3420
3421
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]              ; ST0 loaded before applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
3452
3453
3454
3455;
3456;---------------------- 64-bit signed integer operations ----------------------
3457;
3458
3459
3460;;
3461; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3462;
3463; @param A0 FPU context (fxsave).
3464; @param A1 Pointer to a IEMFPURESULT for the output.
3465; @param A2 Pointer to the 64-bit floating point value to convert.
3466;
3467BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
3468 PROLOGUE_3_ARGS
3469 sub xSP, 20h
3470
3471 fninit
3472 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3473 fild qword [A2]
3474
3475 fnstsw word [A1 + IEMFPURESULT.FSW]
3476 fnclex
3477 fstp tword [A1 + IEMFPURESULT.r80Result]
3478
3479 fninit
3480 add xSP, 20h
3481 EPILOGUE_3_ARGS
3482ENDPROC iemAImpl_fild_r80_from_i64
3483
3484
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; load the input BEFORE applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
3508
3509
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation (fisttp ignores the FCW rounding mode).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; load the input BEFORE applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
3534
3535
3536
;
;---------------------- 32-bit floating point operations ----------------------
;

;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
3564
3565
;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; load the input BEFORE applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
3589
3590
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value (ST0).
; @param A3 Pointer to the 32-bit value (memory operand).
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]              ; ST0 loaded before applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3627
3628
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]              ; ST0 loaded before applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3659
3660
3661
3662;
3663;---------------------- 64-bit floating point operations ----------------------
3664;
3665
3666;;
3667; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3668;
3669; @param A0 FPU context (fxsave).
3670; @param A1 Pointer to a IEMFPURESULT for the output.
3671; @param A2 Pointer to the 64-bit floating point value to convert.
3672;
3673BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3674 PROLOGUE_3_ARGS
3675 sub xSP, 20h
3676
3677 fninit
3678 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3679 fld qword [A2]
3680
3681 fnstsw word [A1 + IEMFPURESULT.FSW]
3682 fnclex
3683 fstp tword [A1 + IEMFPURESULT.r80Result]
3684
3685 fninit
3686 add xSP, 20h
3687 EPILOGUE_3_ARGS
3688ENDPROC iemAImpl_fld_r80_from_r64
3689
3690
;;
; Store a 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; load the input BEFORE applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3714
3715
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value (ST0).
; @param A3 Pointer to the 64-bit value (memory operand).
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]              ; ST0 loaded before applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3752
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]              ; ST0 loaded before applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3783
3784
3785
;
;---------------------- 80-bit floating point operations ----------------------
;

;;
; Loads a 80-bit floating point register value from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3813
3814
;;
; Store a 80-bit floating point register to memory
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; load the input BEFORE applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]              ; tword store always pops (there is no non-popping form)

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3838
3839
;;
; Loads an 80-bit floating point register value in BCD format from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]              ; packed BCD load

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3863
3864
;;
; Store a 80-bit floating point register to memory as BCD
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit BCD value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; load the input BEFORE applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]              ; packed BCD store, pops ST0

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3888
3889
;;
; FPU instruction working on two 80-bit floating point values.
;
; @param 1 The instruction
; @param 2 The instruction operand list, brace-quoted, e.g. {st0, st1},
;          or {} for operand-less instructions (fprem, fprem1, fscale).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST0)
; @param A3 Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; push STn first ...
        fld     tword [A2]              ; ... so A2 ends up as ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3930
3931
;;
; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
; storing the result in ST1 and popping the stack.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST1).
; @param A3 Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]              ; becomes ST1
        fld     tword [A3]              ; becomes ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; leaves the result in ST0 after popping

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3967
3968
;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; push second operand first ...
        fld     tword [A2]              ; ... so A2 ends up as ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
4001
4002
;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param 1 The instruction
;
; @returns EFLAGS in EAX.
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A3]              ; push second operand first ...
        fld     tword [A2]              ; ... so A2 ends up as ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1                     ; fcomi/fucomi compare ST0 with STn, set ZF/PF/CF

        fnstsw  word [A1]
        pushf                           ; grab the EFLAGS the comparison produced
        pop     xAX

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
4038
4039
;;
; FPU instruction working on one 80-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]              ; ST0 loaded before applying guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
4076
4077
;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW.
;
; @param 1 The instruction
; @param 2 Non-zero to also restore FTW (fxam needs the real tag state to
;          classify an empty ST0 correctly).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word [A1]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
4113
4114
4115
;;
; FPU instruction loading a 80-bit floating point constant.
;
; @param 1 The instruction (fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; pushes the constant onto the FPU stack

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before storing result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
4150
4151
;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULTTWO for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; leaves two values on the FPU stack

        fnstsw  word [A1 + IEMFPURESULTTWO.FSW]
        fnclex                          ; clear pending exceptions before each store
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; ST0 -> second result
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; ST1 -> first result

        fninit                          ; leave the FPU in a clean state for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
4186
4187
4188
4189
;---------------------- SSE and MMX Operations ----------------------

; Hook points for any host-state save/restore that may prove necessary
; around MMX/SSE/AVX usage; currently all of them expand to nothing.

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
4209
4210
;;
; Media instruction working on two full sized registers.
;
; @param 1 The instruction
; @param 2 Whether there is an MMX variant (1) or not (0).
;
; @param A0 FPU context (fxsave) - not touched by the generated code.
; @param A1 Pointer to the first media register size operand (input/output).
; @param A2 Pointer to the second media register size operand (input).
;
; @todo r=aeichner Currently unused, can probably be removed.
;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; unaligned moves: operand pointers need not be 16-byte aligned
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
4252
;;
; Media instruction working on two full sized registers, but no FXSAVE state argument.
;
; @param 1 The instruction
; @param 2 Whether there is an MMX variant (1) or not (0).
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, [A1]
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; unaligned moves: operand pointers need not be 16-byte aligned
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

; Instantiations; the second argument flags whether the instruction also
; exists in MMX (64-bit) form.
IEMIMPL_MEDIA_OPT_F2 pshufb, 1
IEMIMPL_MEDIA_OPT_F2 pand, 1
IEMIMPL_MEDIA_OPT_F2 pandn, 1
IEMIMPL_MEDIA_OPT_F2 por, 1
IEMIMPL_MEDIA_OPT_F2 pxor, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqb, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqw, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqd, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqq, 0
IEMIMPL_MEDIA_OPT_F2 pcmpgtb, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtw, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtd, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtq, 0
IEMIMPL_MEDIA_OPT_F2 paddb, 1
IEMIMPL_MEDIA_OPT_F2 paddw, 1
IEMIMPL_MEDIA_OPT_F2 paddd, 1
IEMIMPL_MEDIA_OPT_F2 paddq, 1
IEMIMPL_MEDIA_OPT_F2 paddsb, 1
IEMIMPL_MEDIA_OPT_F2 paddsw, 1
IEMIMPL_MEDIA_OPT_F2 paddusb, 1
IEMIMPL_MEDIA_OPT_F2 paddusw, 1
IEMIMPL_MEDIA_OPT_F2 psubb, 1
IEMIMPL_MEDIA_OPT_F2 psubw, 1
IEMIMPL_MEDIA_OPT_F2 psubd, 1
IEMIMPL_MEDIA_OPT_F2 psubq, 1
IEMIMPL_MEDIA_OPT_F2 psubsb, 1
IEMIMPL_MEDIA_OPT_F2 psubsw, 1
IEMIMPL_MEDIA_OPT_F2 psubusb, 1
IEMIMPL_MEDIA_OPT_F2 psubusw, 1
IEMIMPL_MEDIA_OPT_F2 pmullw, 1
IEMIMPL_MEDIA_OPT_F2 pmulld, 0
IEMIMPL_MEDIA_OPT_F2 pmulhw, 1
IEMIMPL_MEDIA_OPT_F2 pmaddwd, 1
IEMIMPL_MEDIA_OPT_F2 pminub, 1
IEMIMPL_MEDIA_OPT_F2 pminuw, 0
IEMIMPL_MEDIA_OPT_F2 pminud, 0
IEMIMPL_MEDIA_OPT_F2 pminsb, 0
IEMIMPL_MEDIA_OPT_F2 pminsw, 1
IEMIMPL_MEDIA_OPT_F2 pminsd, 0
IEMIMPL_MEDIA_OPT_F2 pmaxub, 1
IEMIMPL_MEDIA_OPT_F2 pmaxuw, 0
IEMIMPL_MEDIA_OPT_F2 pmaxud, 0
IEMIMPL_MEDIA_OPT_F2 pmaxsb, 0
IEMIMPL_MEDIA_OPT_F2 pmaxsw, 1
IEMIMPL_MEDIA_OPT_F2 pmaxsd, 0
IEMIMPL_MEDIA_OPT_F2 pabsb, 1
IEMIMPL_MEDIA_OPT_F2 pabsw, 1
IEMIMPL_MEDIA_OPT_F2 pabsd, 1
IEMIMPL_MEDIA_OPT_F2 psignb, 1
IEMIMPL_MEDIA_OPT_F2 psignw, 1
IEMIMPL_MEDIA_OPT_F2 psignd, 1
IEMIMPL_MEDIA_OPT_F2 phaddw, 1
IEMIMPL_MEDIA_OPT_F2 phaddd, 1
IEMIMPL_MEDIA_OPT_F2 phsubw, 1
IEMIMPL_MEDIA_OPT_F2 phsubd, 1
IEMIMPL_MEDIA_OPT_F2 phaddsw, 1
IEMIMPL_MEDIA_OPT_F2 phsubsw, 1
IEMIMPL_MEDIA_OPT_F2 pmaddubsw, 1
IEMIMPL_MEDIA_OPT_F2 pmulhrsw, 1
IEMIMPL_MEDIA_OPT_F2 pmuludq, 1
IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
4384
4385
4386;;
4387; Media instruction working on one full sized and one half sized register (lower half).
4388;
4389; @param 1 The instruction
4390; @param 2 1 if MMX is included, 0 if not.
4391;
4392; @param A0 Pointer to the first full sized media register operand (input/output).
4393; @param A1 Pointer to the second half sized media register operand (input).
4394;
4395%macro IEMIMPL_MEDIA_F1L1 2
4396 %if %2 != 0
4397BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
4398 PROLOGUE_2_ARGS
4399 IEMIMPL_MMX_PROLOGUE
4400
4401 movq mm0, [A0]
4402 movq mm1, [A1]
4403 %1 mm0, mm1
4404 movq [A0], mm0
4405
4406 IEMIMPL_MMX_EPILOGUE
4407 EPILOGUE_2_ARGS
4408ENDPROC iemAImpl_ %+ %1 %+ _u64
4409 %endif
4410
4411BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
4412 PROLOGUE_2_ARGS
4413 IEMIMPL_SSE_PROLOGUE
4414
4415 movdqu xmm0, [A0]
4416 movdqu xmm1, [A1]
4417 %1 xmm0, xmm1
4418 movdqu [A0], xmm0
4419
4420 IEMIMPL_SSE_EPILOGUE
4421 EPILOGUE_2_ARGS
4422ENDPROC iemAImpl_ %+ %1 %+ _u128
4423%endmacro
4424
4425IEMIMPL_MEDIA_F1L1 punpcklbw, 1
4426IEMIMPL_MEDIA_F1L1 punpcklwd, 1
4427IEMIMPL_MEDIA_F1L1 punpckldq, 1
4428IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
4429
4430
4431;;
4432; Media instruction working two half sized input registers (lower half) and a full sized
4433; destination register (vpunpckh*).
4434;
4435; @param 1 The instruction
4436;
4437; @param A0 Pointer to the destination register (full sized, output only).
4438; @param A1 Pointer to the first full sized media source register operand, where we
4439; will only use the lower half as input - but we'll be loading it in full.
4440; @param A2 Pointer to the second full sized media source register operand, where we
4441; will only use the lower half as input - but we'll be loading it in full.
4442;
4443%macro IEMIMPL_MEDIA_F1L1L1 1
4444BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4445 PROLOGUE_3_ARGS
4446 IEMIMPL_AVX_PROLOGUE
4447
4448 vmovdqu xmm0, [A1]
4449 vmovdqu xmm1, [A2]
4450 %1 xmm0, xmm0, xmm1
4451 vmovdqu [A0], xmm0
4452
4453 IEMIMPL_AVX_PROLOGUE
4454 EPILOGUE_3_ARGS
4455ENDPROC iemAImpl_ %+ %1 %+ _u128
4456
4457BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4458 PROLOGUE_3_ARGS
4459 IEMIMPL_AVX_PROLOGUE
4460
4461 vmovdqu ymm0, [A1]
4462 vmovdqu ymm1, [A2]
4463 %1 ymm0, ymm0, ymm1
4464 vmovdqu [A0], ymm0
4465
4466 IEMIMPL_AVX_PROLOGUE
4467 EPILOGUE_3_ARGS
4468ENDPROC iemAImpl_ %+ %1 %+ _u256
4469%endmacro
4470
4471IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
4472IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
4473IEMIMPL_MEDIA_F1L1L1 vpunpckldq
4474IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
4475
4476
4477;;
4478; Media instruction working on one full sized and one half sized register (high half).
4479;
4480; @param 1 The instruction
4481; @param 2 1 if MMX is included, 0 if not.
4482;
4483; @param A0 Pointer to the first full sized media register operand (input/output).
4484; @param A1 Pointer to the second full sized media register operand, where we
4485; will only use the upper half as input - but we'll load it in full.
4486;
4487%macro IEMIMPL_MEDIA_F1H1 2
4488IEMIMPL_MEDIA_F1L1 %1, %2
4489%endmacro
4490
4491IEMIMPL_MEDIA_F1L1 punpckhbw, 1
4492IEMIMPL_MEDIA_F1L1 punpckhwd, 1
4493IEMIMPL_MEDIA_F1L1 punpckhdq, 1
4494IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
4495
4496
4497;;
4498; Media instruction working two half sized input registers (high half) and a full sized
4499; destination register (vpunpckh*).
4500;
4501; @param 1 The instruction
4502;
4503; @param A0 Pointer to the destination register (full sized, output only).
4504; @param A1 Pointer to the first full sized media source register operand, where we
4505; will only use the upper half as input - but we'll be loading it in full.
4506; @param A2 Pointer to the second full sized media source register operand, where we
4507; will only use the upper half as input - but we'll be loading it in full.
4508;
4509%macro IEMIMPL_MEDIA_F1H1H1 1
4510IEMIMPL_MEDIA_F1L1L1 %1
4511%endmacro
4512
4513IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
4514IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
4515IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
4516IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4517
4518
4519;
4520; Shufflers with evil 8-bit immediates.
4521;
4522
4523BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
4524 PROLOGUE_3_ARGS
4525 IEMIMPL_MMX_PROLOGUE
4526
4527 movzx A2, A2_8 ; must clear top bits
4528 movq mm1, [A1]
4529 movq mm0, mm0 ; paranoia!
4530 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 5
4531 movq [A0], mm0
4532
4533 IEMIMPL_MMX_EPILOGUE
4534 EPILOGUE_3_ARGS
4535%assign bImm 0
4536%rep 256
4537.imm %+ bImm:
4538 IBT_ENDBRxx_WITHOUT_NOTRACK
4539 pshufw mm0, mm1, bImm
4540 ret
4541 %assign bImm bImm + 1
4542%endrep
4543.immEnd:
4544ENDPROC iemAImpl_pshufw_u64
4545
4546
4547%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
4548BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4549 PROLOGUE_3_ARGS
4550 IEMIMPL_SSE_PROLOGUE
4551
4552 movzx A2, A2_8 ; must clear top bits
4553 movdqu xmm1, [A1]
4554 movdqu xmm0, xmm1 ; paranoia!
4555 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
4556 movdqu [A0], xmm0
4557
4558 IEMIMPL_SSE_EPILOGUE
4559 EPILOGUE_3_ARGS
4560
4561 %assign bImm 0
4562 %rep 256
4563.imm %+ bImm:
4564 IBT_ENDBRxx_WITHOUT_NOTRACK
4565 %1 xmm0, xmm1, bImm
4566 ret
4567 %assign bImm bImm + 1
4568 %endrep
4569.immEnd:
4570ENDPROC iemAImpl_ %+ %1 %+ _u128
4571%endmacro
4572
4573IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
4574IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
4575IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4576
4577
4578%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
4579BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4580 PROLOGUE_3_ARGS
4581 IEMIMPL_SSE_PROLOGUE
4582
4583 movzx A2, A2_8 ; must clear top bits
4584 vmovdqu ymm1, [A1]
4585 vmovdqu ymm0, ymm1 ; paranoia!
4586 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
4587 vmovdqu [A0], ymm0
4588
4589 IEMIMPL_SSE_EPILOGUE
4590 EPILOGUE_3_ARGS
4591 %assign bImm 0
4592 %rep 256
4593.imm %+ bImm:
4594 IBT_ENDBRxx_WITHOUT_NOTRACK
4595 %1 ymm0, ymm1, bImm
4596 ret
4597 %assign bImm bImm + 1
4598 %endrep
4599.immEnd:
4600ENDPROC iemAImpl_ %+ %1 %+ _u256
4601%endmacro
4602
4603IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
4604IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
4605IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4606
4607
4608;
4609; Shifts with evil 8-bit immediates.
4610;
4611
4612%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
4613BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
4614 PROLOGUE_2_ARGS
4615 IEMIMPL_MMX_PROLOGUE
4616
4617 movzx A1, A1_8 ; must clear top bits
4618 movq mm0, [A0]
4619 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 5
4620 movq [A0], mm0
4621
4622 IEMIMPL_MMX_EPILOGUE
4623 EPILOGUE_2_ARGS
4624%assign bImm 0
4625%rep 256
4626.imm %+ bImm:
4627 IBT_ENDBRxx_WITHOUT_NOTRACK
4628 %1 mm0, bImm
4629 ret
4630 %assign bImm bImm + 1
4631%endrep
4632.immEnd:
4633ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
4634%endmacro
4635
4636IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4637IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4638IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4639IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4640IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4641IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4642IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4643IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4644
4645
4646%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
4647BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
4648 PROLOGUE_2_ARGS
4649 IEMIMPL_SSE_PROLOGUE
4650
4651 movzx A1, A1_8 ; must clear top bits
4652 movdqu xmm0, [A0]
4653 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 6
4654 movdqu [A0], xmm0
4655
4656 IEMIMPL_SSE_EPILOGUE
4657 EPILOGUE_2_ARGS
4658 %assign bImm 0
4659 %rep 256
4660.imm %+ bImm:
4661 IBT_ENDBRxx_WITHOUT_NOTRACK
4662 %1 xmm0, bImm
4663 ret
4664 %assign bImm bImm + 1
4665 %endrep
4666.immEnd:
4667ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
4668%endmacro
4669
4670IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
4671IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
4672IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
4673IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
4674IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
4675IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
4676IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
4677IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
4678IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
4679IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4680
4681
4682;
4683; Move byte mask.
4684;
4685
4686BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
4687 PROLOGUE_2_ARGS
4688 IEMIMPL_MMX_PROLOGUE
4689
4690 movq mm1, [A1]
4691 pmovmskb T0, mm1
4692 mov [A0], T0
4693%ifdef RT_ARCH_X86
4694 mov dword [A0 + 4], 0
4695%endif
4696 IEMIMPL_MMX_EPILOGUE
4697 EPILOGUE_2_ARGS
4698ENDPROC iemAImpl_pmovmskb_u64
4699
4700BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
4701 PROLOGUE_2_ARGS
4702 IEMIMPL_SSE_PROLOGUE
4703
4704 movdqu xmm1, [A1]
4705 pmovmskb T0, xmm1
4706 mov [A0], T0
4707%ifdef RT_ARCH_X86
4708 mov dword [A0 + 4], 0
4709%endif
4710 IEMIMPL_SSE_EPILOGUE
4711 EPILOGUE_2_ARGS
4712ENDPROC iemAImpl_pmovmskb_u128
4713
4714BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
4715 PROLOGUE_2_ARGS
4716 IEMIMPL_AVX_PROLOGUE
4717
4718 vmovdqu ymm1, [A1]
4719 vpmovmskb T0, ymm1
4720 mov [A0], T0
4721%ifdef RT_ARCH_X86
4722 mov dword [A0 + 4], 0
4723%endif
4724 IEMIMPL_AVX_EPILOGUE
4725 EPILOGUE_2_ARGS
4726ENDPROC iemAImpl_vpmovmskb_u256
4727
4728
4729;;
4730; Media instruction working on two full sized source registers and one destination (AVX).
4731;
4732; @param 1 The instruction
4733;
4734; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4735; @param A1 Pointer to the destination media register size operand (output).
4736; @param A2 Pointer to the first source media register size operand (input).
4737; @param A3 Pointer to the second source media register size operand (input).
4738;
4739; @todo r=aeichner Not used right now
4740;
4741%macro IEMIMPL_MEDIA_F3 1
4742BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4743 PROLOGUE_4_ARGS
4744 IEMIMPL_AVX_PROLOGUE
4745
4746 vmovdqu xmm0, [A2]
4747 vmovdqu xmm1, [A3]
4748 %1 xmm0, xmm0, xmm1
4749 vmovdqu [A1], xmm0
4750
4751 IEMIMPL_AVX_PROLOGUE
4752 EPILOGUE_4_ARGS
4753ENDPROC iemAImpl_ %+ %1 %+ _u128
4754
4755BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4756 PROLOGUE_4_ARGS
4757 IEMIMPL_AVX_PROLOGUE
4758
4759 vmovdqu ymm0, [A2]
4760 vmovdqu ymm1, [A3]
4761 %1 ymm0, ymm0, ymm1
4762 vmovdqu [A1], ymm0
4763
4764 IEMIMPL_AVX_PROLOGUE
4765 EPILOGUE_4_ARGS
4766ENDPROC iemAImpl_ %+ %1 %+ _u256
4767%endmacro
4768
4769;;
4770; Media instruction working on two full sized source registers and one destination (AVX),
4771; but no XSAVE state pointer argument.
4772;
4773; @param 1 The instruction
4774; @param 2 Flag whether to add a 256-bit variant (1) or not (0).
4775;
4776; @param A0 Pointer to the destination media register size operand (output).
4777; @param A1 Pointer to the first source media register size operand (input).
4778; @param A2 Pointer to the second source media register size operand (input).
4779;
4780%macro IEMIMPL_MEDIA_OPT_F3 2
4781BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4782 PROLOGUE_3_ARGS
4783 IEMIMPL_AVX_PROLOGUE
4784
4785 vmovdqu xmm0, [A1]
4786 vmovdqu xmm1, [A2]
4787 %1 xmm0, xmm0, xmm1
4788 vmovdqu [A0], xmm0
4789
4790 IEMIMPL_AVX_PROLOGUE
4791 EPILOGUE_3_ARGS
4792ENDPROC iemAImpl_ %+ %1 %+ _u128
4793
4794 %if %2 == 1
4795BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4796 PROLOGUE_3_ARGS
4797 IEMIMPL_AVX_PROLOGUE
4798
4799 vmovdqu ymm0, [A1]
4800 vmovdqu ymm1, [A2]
4801 %1 ymm0, ymm0, ymm1
4802 vmovdqu [A0], ymm0
4803
4804 IEMIMPL_AVX_PROLOGUE
4805 EPILOGUE_3_ARGS
4806ENDPROC iemAImpl_ %+ %1 %+ _u256
4807 %endif
4808%endmacro
4809
4810IEMIMPL_MEDIA_OPT_F3 vpshufb, 1
4811IEMIMPL_MEDIA_OPT_F3 vpand, 1
4812IEMIMPL_MEDIA_OPT_F3 vpminub, 1
4813IEMIMPL_MEDIA_OPT_F3 vpminuw, 1
4814IEMIMPL_MEDIA_OPT_F3 vpminud, 1
4815IEMIMPL_MEDIA_OPT_F3 vpminsb, 1
4816IEMIMPL_MEDIA_OPT_F3 vpminsw, 1
4817IEMIMPL_MEDIA_OPT_F3 vpminsd, 1
4818IEMIMPL_MEDIA_OPT_F3 vpmaxub, 1
4819IEMIMPL_MEDIA_OPT_F3 vpmaxuw, 1
4820IEMIMPL_MEDIA_OPT_F3 vpmaxud, 1
4821IEMIMPL_MEDIA_OPT_F3 vpmaxsb, 1
4822IEMIMPL_MEDIA_OPT_F3 vpmaxsw, 1
4823IEMIMPL_MEDIA_OPT_F3 vpmaxsd, 1
4824IEMIMPL_MEDIA_OPT_F3 vpandn, 1
4825IEMIMPL_MEDIA_OPT_F3 vpor, 1
4826IEMIMPL_MEDIA_OPT_F3 vpxor, 1
4827IEMIMPL_MEDIA_OPT_F3 vpcmpeqb, 1
4828IEMIMPL_MEDIA_OPT_F3 vpcmpeqw, 1
4829IEMIMPL_MEDIA_OPT_F3 vpcmpeqd, 1
4830IEMIMPL_MEDIA_OPT_F3 vpcmpeqq, 1
4831IEMIMPL_MEDIA_OPT_F3 vpcmpgtb, 1
4832IEMIMPL_MEDIA_OPT_F3 vpcmpgtw, 1
4833IEMIMPL_MEDIA_OPT_F3 vpcmpgtd, 1
4834IEMIMPL_MEDIA_OPT_F3 vpcmpgtq, 1
4835IEMIMPL_MEDIA_OPT_F3 vpaddb, 1
4836IEMIMPL_MEDIA_OPT_F3 vpaddw, 1
4837IEMIMPL_MEDIA_OPT_F3 vpaddd, 1
4838IEMIMPL_MEDIA_OPT_F3 vpaddq, 1
4839IEMIMPL_MEDIA_OPT_F3 vpsubb, 1
4840IEMIMPL_MEDIA_OPT_F3 vpsubw, 1
4841IEMIMPL_MEDIA_OPT_F3 vpsubd, 1
4842IEMIMPL_MEDIA_OPT_F3 vpsubq, 1
4843IEMIMPL_MEDIA_OPT_F3 vpacksswb, 1
4844IEMIMPL_MEDIA_OPT_F3 vpackssdw, 1
4845IEMIMPL_MEDIA_OPT_F3 vpackuswb, 1
4846IEMIMPL_MEDIA_OPT_F3 vpackusdw, 1
4847IEMIMPL_MEDIA_OPT_F3 vpmullw, 1
4848IEMIMPL_MEDIA_OPT_F3 vpmulld, 1
4849IEMIMPL_MEDIA_OPT_F3 vpmulhw, 1
4850IEMIMPL_MEDIA_OPT_F3 vpmulhuw, 1
4851IEMIMPL_MEDIA_OPT_F3 vpavgb, 1
4852IEMIMPL_MEDIA_OPT_F3 vpavgw, 1
4853IEMIMPL_MEDIA_OPT_F3 vpsignb, 1
4854IEMIMPL_MEDIA_OPT_F3 vpsignw, 1
4855IEMIMPL_MEDIA_OPT_F3 vpsignd, 1
4856IEMIMPL_MEDIA_OPT_F3 vphaddw, 1
4857IEMIMPL_MEDIA_OPT_F3 vphaddd, 1
4858IEMIMPL_MEDIA_OPT_F3 vphsubw, 1
4859IEMIMPL_MEDIA_OPT_F3 vphsubd, 1
4860IEMIMPL_MEDIA_OPT_F3 vphaddsw, 1
4861IEMIMPL_MEDIA_OPT_F3 vphsubsw, 1
4862IEMIMPL_MEDIA_OPT_F3 vpmaddubsw, 1
4863IEMIMPL_MEDIA_OPT_F3 vpmulhrsw, 1
4864IEMIMPL_MEDIA_OPT_F3 vpsadbw, 1
4865IEMIMPL_MEDIA_OPT_F3 vpmuldq, 1
4866IEMIMPL_MEDIA_OPT_F3 vpmuludq, 1
4867IEMIMPL_MEDIA_OPT_F3 vunpcklps, 1
4868IEMIMPL_MEDIA_OPT_F3 vunpcklpd, 1
4869IEMIMPL_MEDIA_OPT_F3 vunpckhps, 1
4870IEMIMPL_MEDIA_OPT_F3 vunpckhpd, 1
4871IEMIMPL_MEDIA_OPT_F3 vpsubsb, 1
4872IEMIMPL_MEDIA_OPT_F3 vpsubsw, 1
4873IEMIMPL_MEDIA_OPT_F3 vpsubusb, 1
4874IEMIMPL_MEDIA_OPT_F3 vpsubusw, 1
4875IEMIMPL_MEDIA_OPT_F3 vpaddusb, 1
4876IEMIMPL_MEDIA_OPT_F3 vpaddusw, 1
4877IEMIMPL_MEDIA_OPT_F3 vpaddsb, 1
4878IEMIMPL_MEDIA_OPT_F3 vpaddsw, 1
4879IEMIMPL_MEDIA_OPT_F3 vpermilps, 1
4880IEMIMPL_MEDIA_OPT_F3 vpermilpd, 1
4881IEMIMPL_MEDIA_OPT_F3 vpmaddwd, 1
4882IEMIMPL_MEDIA_OPT_F3 vpsrlvd, 1
4883IEMIMPL_MEDIA_OPT_F3 vpsrlvq, 1
4884IEMIMPL_MEDIA_OPT_F3 vpsravd, 1
4885IEMIMPL_MEDIA_OPT_F3 vpsllvd, 1
4886IEMIMPL_MEDIA_OPT_F3 vpsllvq, 1
4887
4888IEMIMPL_MEDIA_OPT_F3 vaesenc, 0
4889IEMIMPL_MEDIA_OPT_F3 vaesenclast, 0
4890IEMIMPL_MEDIA_OPT_F3 vaesdec, 0
4891IEMIMPL_MEDIA_OPT_F3 vaesdeclast, 0
4892
4893
4894;;
4895; VAESIMC instruction.
4896;
4897; @param A0 Pointer to the first media register size operand (output).
4898; @param A1 Pointer to the second media register size operand (input).
4899;
4900BEGINPROC_FASTCALL iemAImpl_vaesimc_u128, 8
4901 PROLOGUE_2_ARGS
4902 IEMIMPL_SSE_PROLOGUE
4903
4904 movdqu xmm0, [A0]
4905 movdqu xmm1, [A1]
4906 vaesimc xmm0, xmm1
4907 movdqu [A0], xmm0
4908
4909 IEMIMPL_SSE_EPILOGUE
4910 EPILOGUE_2_ARGS
4911ENDPROC iemAImpl_vaesimc_u128
4912
4913
4914;;
4915; VAESKEYGENASSIST instruction.
4916;
4917; @param A0 Pointer to the first media register size operand (output).
4918; @param A1 Pointer to the second media register size operand (input).
4919; @param A2 8-bit immediate for the round constant.
4920;
4921BEGINPROC_FASTCALL iemAImpl_vaeskeygenassist_u128, 16
4922 PROLOGUE_3_ARGS
4923 IEMIMPL_AVX_PROLOGUE
4924
4925 movzx A2, A2_8 ; must clear top bits
4926 movdqu xmm0, [A0]
4927 movdqu xmm1, [A1]
4928 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
4929 movdqu [A0], xmm0
4930
4931 IEMIMPL_AVX_EPILOGUE
4932 EPILOGUE_3_ARGS
4933 %assign bImm 0
4934 %rep 256
4935.imm %+ bImm:
4936 IBT_ENDBRxx_WITHOUT_NOTRACK
4937 vaeskeygenassist xmm0, xmm1, bImm
4938 ret
4939 int3
4940 %assign bImm bImm + 1
4941 %endrep
4942.immEnd:
4943ENDPROC iemAImpl_vaeskeygenassist_u128
4944
4945
4946;;
4947; VPERMQ instruction.
4948;
4949; @param A0 Pointer to the first media register size operand (output).
4950; @param A1 Pointer to the second media register size operand (input).
4951; @param A2 8-bit immediate for the round constant.
4952;
4953BEGINPROC_FASTCALL iemAImpl_vpermq_u256, 16
4954 PROLOGUE_3_ARGS
4955 IEMIMPL_AVX_PROLOGUE
4956
4957 movzx A2, A2_8 ; must clear top bits
4958 vmovdqu ymm1, [A1]
4959 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
4960 vmovdqu [A0], ymm0
4961
4962 IEMIMPL_AVX_EPILOGUE
4963 EPILOGUE_3_ARGS
4964 %assign bImm 0
4965 %rep 256
4966.imm %+ bImm:
4967 IBT_ENDBRxx_WITHOUT_NOTRACK
4968 vpermq ymm0, ymm1, bImm
4969 ret
4970 int3
4971 %assign bImm bImm + 1
4972 %endrep
4973.immEnd:
4974ENDPROC iemAImpl_vpermq_u256
4975
4976
4977;;
4978; VPERMPD instruction.
4979;
4980; @param A0 Pointer to the first media register size operand (output).
4981; @param A1 Pointer to the second media register size operand (input).
4982; @param A2 8-bit immediate for the round constant.
4983;
4984BEGINPROC_FASTCALL iemAImpl_vpermpd_u256, 16
4985 PROLOGUE_3_ARGS
4986 IEMIMPL_AVX_PROLOGUE
4987
4988 movzx A2, A2_8 ; must clear top bits
4989 vmovdqu ymm1, [A1]
4990 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
4991 vmovdqu [A0], ymm0
4992
4993 IEMIMPL_AVX_EPILOGUE
4994 EPILOGUE_3_ARGS
4995 %assign bImm 0
4996 %rep 256
4997.imm %+ bImm:
4998 IBT_ENDBRxx_WITHOUT_NOTRACK
4999 vpermpd ymm0, ymm1, bImm
5000 ret
5001 int3
5002 %assign bImm bImm + 1
5003 %endrep
5004.immEnd:
5005ENDPROC iemAImpl_vpermpd_u256
5006
5007
5008;;
5009; VPERMPS instruction.
5010;
5011; @param A0 Pointer to the first media register size operand (output).
5012; @param A1 Pointer to the second media register size operand (input).
5013; @param A2 Pointer to the third media register size operand (input).
5014;
5015BEGINPROC_FASTCALL iemAImpl_vpermps_u256, 16
5016 PROLOGUE_3_ARGS
5017 IEMIMPL_AVX_PROLOGUE
5018
5019 vmovdqu ymm0, [A1]
5020 vmovdqu ymm1, [A2]
5021 vpermps ymm0, ymm0, ymm1
5022 vmovdqu [A0], ymm0
5023
5024 IEMIMPL_AVX_EPILOGUE
5025 EPILOGUE_3_ARGS
5026ENDPROC iemAImpl_vpermps_u256
5027
5028
5029;;
5030; VPERMD instruction.
5031;
5032; @param A0 Pointer to the first media register size operand (output).
5033; @param A1 Pointer to the second media register size operand (input).
5034; @param A2 Pointer to the third media register size operand (input).
5035;
5036BEGINPROC_FASTCALL iemAImpl_vpermd_u256, 16
5037 PROLOGUE_3_ARGS
5038 IEMIMPL_AVX_PROLOGUE
5039
5040 vmovdqu ymm0, [A1]
5041 vmovdqu ymm1, [A2]
5042 vpermd ymm0, ymm0, ymm1
5043 vmovdqu [A0], ymm0
5044
5045 IEMIMPL_AVX_EPILOGUE
5046 EPILOGUE_3_ARGS
5047ENDPROC iemAImpl_vpermd_u256
5048
5049
5050;;
5051; Media instruction working on one full sized source register, one full sized destination
5052; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
5053; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
5054; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
5055; of either 16, 32, or 64, it acts like the max shift size)
5056;
5057; @param 1 The instruction
5058;
5059; @param A0 Pointer to the destination media register size operand (output).
5060; @param A1 Pointer to the first source media register size operand (input).
5061; @param A2 Pointer to the second source media register size operand (input).
5062;
5063%macro IEMIMPL_SHIFT_OPT_F3 1
5064BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
5065 PROLOGUE_3_ARGS
5066 IEMIMPL_AVX_PROLOGUE
5067
5068 vmovdqu xmm0, [A1]
5069 vmovdqu xmm1, [A2]
5070 %1 xmm0, xmm0, xmm1
5071 vmovdqu [A0], xmm0
5072
5073 IEMIMPL_AVX_PROLOGUE
5074 EPILOGUE_3_ARGS
5075ENDPROC iemAImpl_ %+ %1 %+ _u128
5076
5077BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
5078 PROLOGUE_3_ARGS
5079 IEMIMPL_AVX_PROLOGUE
5080
5081 vmovdqu ymm0, [A1]
5082 vmovdqu xmm1, [A2]
5083 %1 ymm0, ymm0, xmm1
5084 vmovdqu [A0], ymm0
5085
5086 IEMIMPL_AVX_PROLOGUE
5087 EPILOGUE_3_ARGS
5088ENDPROC iemAImpl_ %+ %1 %+ _u256
5089%endmacro
5090
5091IEMIMPL_SHIFT_OPT_F3 vpsllw
5092IEMIMPL_SHIFT_OPT_F3 vpslld
5093IEMIMPL_SHIFT_OPT_F3 vpsllq
5094IEMIMPL_SHIFT_OPT_F3 vpsraw
5095IEMIMPL_SHIFT_OPT_F3 vpsrad
5096IEMIMPL_SHIFT_OPT_F3 vpsrlw
5097IEMIMPL_SHIFT_OPT_F3 vpsrld
5098IEMIMPL_SHIFT_OPT_F3 vpsrlq
5099
5100
5101;;
5102; Media instruction working on one full sized source registers and one destination (AVX),
5103; but no XSAVE state pointer argument.
5104;
5105; @param 1 The instruction
5106; @param 2 Flag whether the isntruction has a 256-bit (AVX2) variant (1) or not (0).
5107;
5108; @param A0 Pointer to the destination media register size operand (output).
5109; @param A1 Pointer to the source media register size operand (input).
5110;
5111%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
5112BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
5113 PROLOGUE_2_ARGS
5114 IEMIMPL_AVX_PROLOGUE
5115
5116 vmovdqu xmm0, [A1]
5117 %1 xmm0, xmm0
5118 vmovdqu [A0], xmm0
5119
5120 IEMIMPL_AVX_PROLOGUE
5121 EPILOGUE_2_ARGS
5122ENDPROC iemAImpl_ %+ %1 %+ _u128
5123
5124 %if %2 == 1
5125BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
5126 PROLOGUE_2_ARGS
5127 IEMIMPL_AVX_PROLOGUE
5128
5129 vmovdqu ymm0, [A1]
5130 %1 ymm0, ymm0
5131 vmovdqu [A0], ymm0
5132
5133 IEMIMPL_AVX_PROLOGUE
5134 EPILOGUE_2_ARGS
5135ENDPROC iemAImpl_ %+ %1 %+ _u256
5136 %endif
5137%endmacro
5138
5139IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
5140IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
5141IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
5142IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
5143
5144
5145;
5146; The SSE 4.2 crc32
5147;
; @param A0 Pointer to the 32-bit destination (CRC accumulator, input/output).
; @param A1 The source operand, sized according to the suffix.
5150;
5151BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
5152 PROLOGUE_2_ARGS
5153
5154 mov T0_32, [A0]
5155 crc32 T0_32, A1_8
5156 mov [A0], T0_32
5157
5158 EPILOGUE_2_ARGS
5159ENDPROC iemAImpl_crc32_u8
5160
5161BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
5162 PROLOGUE_2_ARGS
5163
5164 mov T0_32, [A0]
5165 crc32 T0_32, A1_16
5166 mov [A0], T0_32
5167
5168 EPILOGUE_2_ARGS
5169ENDPROC iemAImpl_crc32_u16
5170
5171BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
5172 PROLOGUE_2_ARGS
5173
5174 mov T0_32, [A0]
5175 crc32 T0_32, A1_32
5176 mov [A0], T0_32
5177
5178 EPILOGUE_2_ARGS
5179ENDPROC iemAImpl_crc32_u32
5180
5181%ifdef RT_ARCH_AMD64
5182BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
5183 PROLOGUE_2_ARGS
5184
5185 mov T0_32, [A0]
5186 crc32 T0, A1
5187 mov [A0], T0_32
5188
5189 EPILOGUE_2_ARGS
5190ENDPROC iemAImpl_crc32_u64
5191%endif
5192
5193
5194;
5195; PTEST (SSE 4.1)
5196;
5197; @param A0 Pointer to the first source operand (aka readonly destination).
5198; @param A1 Pointer to the second source operand.
5199; @param A2 Pointer to the EFLAGS register.
5200;
5201BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
5202 PROLOGUE_3_ARGS
5203 IEMIMPL_SSE_PROLOGUE
5204
5205 movdqu xmm0, [A0]
5206 movdqu xmm1, [A1]
5207 ptest xmm0, xmm1
5208 IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF
5209
5210 IEMIMPL_SSE_EPILOGUE
5211 EPILOGUE_3_ARGS
5212ENDPROC iemAImpl_ptest_u128
5213
5214BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
5215 PROLOGUE_3_ARGS
5216 IEMIMPL_SSE_PROLOGUE
5217
5218 vmovdqu ymm0, [A0]
5219 vmovdqu ymm1, [A1]
5220 vptest ymm0, ymm1
5221 IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF
5222
5223 IEMIMPL_SSE_EPILOGUE
5224 EPILOGUE_3_ARGS
5225ENDPROC iemAImpl_vptest_u256
5226
5227
5228;; Template for the vtestp{s,d} instructions
5229;
5230; @param 1 The instruction
5231;
5232; @param A0 Pointer to the first source operand (aka readonly destination).
5233; @param A1 Pointer to the second source operand.
5234; @param A2 Pointer to the EFLAGS register.
5235;
5236%macro IEMIMPL_VTESTP_S_D 1
5237BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
5238 PROLOGUE_3_ARGS
5239 IEMIMPL_AVX_PROLOGUE
5240
5241 vmovdqu xmm0, [A0]
5242 vmovdqu xmm1, [A1]
5243 %1 xmm0, xmm1
5244 IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF
5245
5246 IEMIMPL_AVX_EPILOGUE
5247 EPILOGUE_3_ARGS
5248ENDPROC iemAImpl_ %+ %1 %+ _u128
5249
5250BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
5251 PROLOGUE_3_ARGS
5252 IEMIMPL_AVX_PROLOGUE
5253
5254 vmovdqu ymm0, [A0]
5255 vmovdqu ymm1, [A1]
5256 %1 ymm0, ymm1
5257 IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF
5258
5259 IEMIMPL_AVX_EPILOGUE
5260 EPILOGUE_3_ARGS
5261ENDPROC iemAImpl_ %+ %1 %+ _u256
5262%endmacro
5263
5264IEMIMPL_VTESTP_S_D vtestps
5265IEMIMPL_VTESTP_S_D vtestpd
5266
5267
5268;;
5269; Template for the [v]pmov{s,z}x* instructions
5270;
5271; @param 1 The instruction
5272;
5273; @param A0 Pointer to the destination media register size operand (output).
5274; @param A1 The source operand value (input).
5275;
5276%macro IEMIMPL_V_PMOV_SZ_X 1
5277BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
5278 PROLOGUE_2_ARGS
5279 IEMIMPL_SSE_PROLOGUE
5280
5281 movd xmm0, A1
5282 %1 xmm0, xmm0
5283 vmovdqu [A0], xmm0
5284
5285 IEMIMPL_SSE_PROLOGUE
5286 EPILOGUE_2_ARGS
5287ENDPROC iemAImpl_ %+ %1 %+ _u128
5288
5289BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
5290 PROLOGUE_2_ARGS
5291 IEMIMPL_AVX_PROLOGUE
5292
5293 movd xmm0, A1
5294 v %+ %1 xmm0, xmm0
5295 vmovdqu [A0], xmm0
5296
5297 IEMIMPL_AVX_PROLOGUE
5298 EPILOGUE_2_ARGS
5299ENDPROC iemAImpl_v %+ %1 %+ _u128
5300
5301BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
5302 PROLOGUE_2_ARGS
5303 IEMIMPL_AVX_PROLOGUE
5304
5305 movdqu xmm0, [A1]
5306 v %+ %1 ymm0, xmm0
5307 vmovdqu [A0], ymm0
5308
5309 IEMIMPL_AVX_PROLOGUE
5310 EPILOGUE_2_ARGS
5311ENDPROC iemAImpl_v %+ %1 %+ _u256
5312%endmacro
5313
5314IEMIMPL_V_PMOV_SZ_X pmovsxbw
5315IEMIMPL_V_PMOV_SZ_X pmovsxbd
5316IEMIMPL_V_PMOV_SZ_X pmovsxbq
5317IEMIMPL_V_PMOV_SZ_X pmovsxwd
5318IEMIMPL_V_PMOV_SZ_X pmovsxwq
5319IEMIMPL_V_PMOV_SZ_X pmovsxdq
5320
5321IEMIMPL_V_PMOV_SZ_X pmovzxbw
5322IEMIMPL_V_PMOV_SZ_X pmovzxbd
5323IEMIMPL_V_PMOV_SZ_X pmovzxbq
5324IEMIMPL_V_PMOV_SZ_X pmovzxwd
5325IEMIMPL_V_PMOV_SZ_X pmovzxwq
5326IEMIMPL_V_PMOV_SZ_X pmovzxdq
5327
5328
5329;;
5330; Initialize the SSE MXCSR register using the guest value partially to
5331; account for rounding mode, load the value from the given register.
5332;
; @uses T0 and 8 bytes of stack transiently; leaves 4 bytes allocated on the
;        stack holding the original MXCSR until SSE_AVX_ST_MXCSR pops it.
5334; @param 1 Expression giving the register holding the guest's MXCSR.
5335;
5336%macro SSE_AVX_LD_MXCSR 1
5337 sub xSP, 4
5338
5339 stmxcsr [xSP]
5340 mov T0_32, %1
5341 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
5342 or T0_32, X86_MXCSR_XCPT_MASK
5343 sub xSP, 4
5344 mov [xSP], T0_32
5345 ldmxcsr [xSP]
5346 add xSP, 4
5347%endmacro
5348
5349
5350;;
5351; Restores the SSE MXCSR register with the original value.
5352;
; @uses 4 bytes of stack to read back the current MXCSR, T1, T2; also pops
;        the 4 bytes left allocated by SSE_AVX_LD_MXCSR.
5354; @param 1 Expression giving the register to return the new guest's MXCSR value.
5355; @param 2 Expression giving the register holding original guest's MXCSR value.
5356;
5357; @note Restores the stack pointer.
5358;
5359%macro SSE_AVX_ST_MXCSR 2
5360 sub xSP, 4
5361 stmxcsr [xSP]
5362 mov %1, [xSP]
5363 add xSP, 4
5364 ; Merge the status bits into the original MXCSR value.
5365 and %1, X86_MXCSR_XCPT_FLAGS
5366 ;
5367 ; If PE is set together with OE/UE and neither are masked
5368 ; PE needs to be cleared because on real hardware
5369 ; an exception is generated with only OE/UE being set,
5370 ; but because we mask all exceptions PE will get set as well.
5371 ;
5372 mov T2_32, %1
5373 and T2_32, X86_MXCSR_OE | X86_MXCSR_UE
5374 mov T1_32, %2
5375 and T1_32, X86_MXCSR_OM | X86_MXCSR_UM
5376 shr T1_32, X86_MXCSR_XCPT_MASK_SHIFT
5377 not T1_32
5378 and T2_32, T1_32
5379 test T2_32, X86_MXCSR_OE | X86_MXCSR_UE
5380 jz .excp_masked
5381 btr %1, X86_MXCSR_PE_BIT
5382.excp_masked:
5383 or %1, %2
5384
5385 ldmxcsr [xSP]
5386 add xSP, 4
5387%endmacro
5388
5389
5390;;
5391; Floating point instruction working on two full sized registers.
5392;
5393; @param 1 The instruction
5394; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
5395;
5396; @returns R0_32 The new MXCSR value of the guest.
5397; @param A0 The guest's MXCSR register value to use.
5398; @param A1 Where to return the result.
5399; @param A2 Pointer to the first media register size operand (input/output).
5400; @param A3 Pointer to the second media register size operand (input).
5401;
5402%macro IEMIMPL_FP_F2 2
5403BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
5404 PROLOGUE_4_ARGS
5405 IEMIMPL_SSE_PROLOGUE
5406 SSE_AVX_LD_MXCSR A0_32
5407
5408 movdqu xmm0, [A2]
5409 movdqu xmm1, [A3]
5410 %1 xmm0, xmm1
5411 movdqu [A1], xmm0
5412
5413 SSE_AVX_ST_MXCSR R0_32, A0_32
5414 IEMIMPL_SSE_EPILOGUE
5415 EPILOGUE_4_ARGS
5416ENDPROC iemAImpl_ %+ %1 %+ _u128
5417
5418 %if %2 == 3
5419BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
5420 PROLOGUE_4_ARGS
5421 IEMIMPL_AVX_PROLOGUE
5422 SSE_AVX_LD_MXCSR A0_32
5423
5424 vmovdqu xmm0, [A2]
5425 vmovdqu xmm1, [A3]
5426 v %+ %1 xmm0, xmm0, xmm1
5427 vmovdqu [A1], xmm0
5428
5429 SSE_AVX_ST_MXCSR R0_32, A0_32
5430 IEMIMPL_AVX_EPILOGUE
5431 EPILOGUE_4_ARGS
5432ENDPROC iemAImpl_v %+ %1 %+ _u128
5433
5434BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
5435 PROLOGUE_4_ARGS
5436 IEMIMPL_AVX_PROLOGUE
5437 SSE_AVX_LD_MXCSR A0_32
5438
5439 vmovdqu ymm0, [A2]
5440 vmovdqu ymm1, [A3]
5441 v %+ %1 ymm0, ymm0, ymm1
5442 vmovdqu [A1], ymm0
5443
5444 SSE_AVX_ST_MXCSR R0_32, A0_32
5445 IEMIMPL_AVX_EPILOGUE
5446 EPILOGUE_4_ARGS
5447ENDPROC iemAImpl_v %+ %1 %+ _u256
5448 %elif %2 == 2
5449BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
5450 PROLOGUE_3_ARGS
5451 IEMIMPL_AVX_PROLOGUE
5452 SSE_AVX_LD_MXCSR A0_32
5453
5454 vmovdqu xmm1, [A2]
5455 v %+ %1 xmm0, xmm1
5456 vmovdqu [A1], xmm0
5457
5458 SSE_AVX_ST_MXCSR R0_32, A0_32
5459 IEMIMPL_AVX_EPILOGUE
5460 EPILOGUE_3_ARGS
5461ENDPROC iemAImpl_v %+ %1 %+ _u128
5462
5463BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
5464 PROLOGUE_3_ARGS
5465 IEMIMPL_AVX_PROLOGUE
5466 SSE_AVX_LD_MXCSR A0_32
5467
5468 vmovdqu ymm1, [A2]
5469 v %+ %1 ymm0, ymm1
5470 vmovdqu [A1], ymm0
5471
5472 SSE_AVX_ST_MXCSR R0_32, A0_32
5473 IEMIMPL_AVX_EPILOGUE
5474 EPILOGUE_3_ARGS
5475ENDPROC iemAImpl_v %+ %1 %+ _u256
5476 %endif
5477%endmacro
5478
; Instantiations of IEMIMPL_FP_F2 for the packed binary operations.
; The second parameter selects which AVX forms the macro emits:
;   3 - three-operand AVX variants (vxxx xmm0, xmm0, xmm1 and the ymm equivalent),
;   2 - two-operand AVX variants   (vxxx xmm0, xmm1 and the ymm equivalent),
;   0 - no AVX variants are generated at all.
IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 rcpps, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
5512
5513
5514;;
5515; Floating point instruction working on a full sized register and a single precision operand.
5516;
5517; @param 1 The instruction
5518;
5519; @return R0_32 The new MXCSR value of the guest.
5520; @param A0 The guest's MXCSR register value to use.
5521; @param A1 Where to return the result.
5522; @param A2 Pointer to the first media register size operand (input/output).
5523; @param A3 Pointer to the second single precision floating point value (input).
5524;
%macro IEMIMPL_FP_F2_R32 1
; SSE form: full 128-bit register at [A2] op single-precision scalar at [A3],
; result stored at [A1], updated guest MXCSR returned in R0_32.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run the op under the guest's MXCSR

        movdqu  xmm0, [A2]              ; first operand (full media register)
        movd    xmm1, [A3]              ; second operand (r32 scalar)
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; store the result

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; collect exception flags, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

; AVX form: same operation via the three-operand VEX encoding.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; Fix: was IEMIMPL_AVX_PROLOGUE, leaving the AVX prologue unbalanced
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
5567
5568
5569;;
5570; Floating point instruction working on a full sized register and a double precision operand.
5571;
5572; @param 1 The instruction
5573;
5574; @return R0_32 The new MXCSR value of the guest.
5575; @param A0 The guest's MXCSR register value to use.
5576; @param A1 Where to return the result.
5577; @param A2 Pointer to the first media register size operand (input/output).
5578; @param A3 Pointer to the second double precision floating point value (input).
5579;
%macro IEMIMPL_FP_F2_R64 1
; SSE form: operate on the full register loaded from [A2] with the double
; precision scalar at [A3]; result goes to [A1], updated guest MXCSR to R0_32.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; activate the guest MXCSR

        movq    xmm3, [A3]              ; fetch the scalar source first
        movdqu  xmm2, [A2]              ; then the full register operand
        %1      xmm2, xmm3              ; perform the operation
        movdqu  [A1], xmm2              ; hand back the result

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; harvest flags + restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

; AVX form: identical operation through the three-operand VEX encoding.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovq   xmm3, [A3]
        vmovdqu xmm2, [A2]
        v %+ %1 xmm2, xmm2, xmm3
        vmovdqu [A1], xmm2

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
5620
5621
5622;;
5623; Macro for the cvtpd2ps/cvtps2pd instructions.
5624;
5625; 1 The instruction name.
5626; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
5627;
5628; @return R0_32 The new MXCSR value of the guest.
5629; @param A0_32 The guest's MXCSR register value to use.
5630; @param A1 Where to return the result.
5631; @param A2 Pointer to the first media register size operand (input/output).
5632; @param A3 Pointer to the second media register size operand (input).
5633;
%macro IEMIMPL_CVT_F2 2
; SSE form: converts the 128-bit value at [A3]; result stored at [A1].
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]              ; destination value; fully overwritten by the converters instantiated below
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; AVX form, 128-bit source and destination.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm1, [A2]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_u128

; AVX form with a 256-bit operand on one side: 256-bit source and 128-bit
; result when %2 == 0, 128-bit source and 256-bit result when %2 == 1.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm1, [A2]              ; Fix: load all 256 bits (was vmovdqu xmm1, dropping the upper half)
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; Fix: use the 256-bit source form (was xmm1, converting only the low half)
 %else
        v %+ %1 ymm0, xmm1              ; 128-bit source (low half of ymm1), 256-bit result
 %endif
        vmovdqu [A1], ymm0              ; VEX-encoded ops zero the upper half when the result is 128-bit

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvttpd2dq, 0
IEMIMPL_CVT_F2 cvtpd2dq, 0
5686
5687;IEMIMPL_CVT_F2 cvtps2pd, 1 - inefficient.
5688
; cvtps2pd: widen the two packed singles at [A2] to two packed doubles at [A1],
; returning the updated guest MXCSR in R0_32.
BEGINPROC_FASTCALL iemAImpl_cvtps2pd_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run under the guest MXCSR

        movq    xmm1, [A2]              ; only the low qword (two singles) is consumed
        cvtps2pd xmm1, xmm1             ; widen in place (register form reads the low 64 bits)
        movdqu  [A1], xmm1

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtps2pd_u128
5701
5702
5703;;
5704; vcvtps2pd instruction - 128-bit variant.
5705;
5706; @return R0_32 The new MXCSR value of the guest.
5707; @param A0_32 The guest's MXCSR register value to use.
5708; @param A1 Pointer to the result operand (output).
5709; @param A2 Pointer to the second operand (input).
5710;
BEGINPROC_FASTCALL iemAImpl_vcvtps2pd_u128_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; switch to the guest MXCSR

        vmovq   xmm2, [A2]              ; two packed singles in the low qword
        vcvtps2pd xmm2, xmm2            ; widen to two packed doubles
        vmovdqu [A1], xmm2              ; 128-bit result

        SSE_AVX_ST_MXCSR R0_32, A0_32   ; return updated MXCSR, restore host value
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtps2pd_u128_u64
5723
5724
5725;;
5726; vcvtps2pd instruction - 256-bit variant.
5727;
5728; @return R0_32 The new MXCSR value of the guest.
5729; @param A0_32 The guest's MXCSR register value to use.
5730; @param A1 Pointer to the result operand (output).
5731; @param A2 Pointer to the second operand (input).
5732;
BEGINPROC_FASTCALL iemAImpl_vcvtps2pd_u256_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]              ; four packed singles
        vcvtps2pd ymm0, xmm0            ; Fix: was xmm1, which was never loaded (garbage source)
        vmovdqu [A1], ymm0              ; four packed doubles

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtps2pd_u256_u128
5746
5747
5748;;
5749; vcvtdq2pd instruction - 128-bit variant.
5750;
5751; @return R0_32 The new MXCSR value of the guest.
5752; @param A0_32 The guest's MXCSR register value to use.
5753; @param A1 Pointer to the result operand (output).
5754; @param A2 Pointer to the second operand (input).
5755;
BEGINPROC_FASTCALL iemAImpl_vcvtdq2pd_u128_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; run under the guest MXCSR

        vmovq   xmm2, [A2]              ; two int32 values in the low qword
        vcvtdq2pd xmm2, xmm2            ; int32 -> double, widening in place
        vmovdqu [A1], xmm2

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtdq2pd_u128_u64
5768
5769
5770;;
5771; vcvtdq2pd instruction - 256-bit variant.
5772;
5773; @return R0_32 The new MXCSR value of the guest.
5774; @param A0_32 The guest's MXCSR register value to use.
5775; @param A1 Pointer to the result operand (output).
5776; @param A2 Pointer to the second operand (input).
5777;
BEGINPROC_FASTCALL iemAImpl_vcvtdq2pd_u256_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]              ; four int32 values
        vcvtdq2pd ymm0, xmm0            ; Fix: was xmm1, which was never loaded (garbage source)
        vmovdqu [A1], ymm0              ; four packed doubles

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtdq2pd_u256_u128
5791
5792
5793;;
5794; shufps instructions with 8-bit immediates.
5795;
5796; @param A0 Pointer to the destination media register size operand (input/output).
5797; @param A1 Pointer to the first source media register size operand (input).
5798; @param A2 The 8-bit immediate
5799;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]              ; destination operand (input/output)
        movdqu  xmm1, [A1]              ; source operand
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6 ; dispatch on the immediate; each stub below occupies 6 bytes
        movdqu  [A0], xmm0              ; write back the shuffled result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps  xmm0, xmm1, bImm        ; 4-byte encoding (0F C6 /r ib)
        ret                             ; +1 byte
        int3                            ; +1 byte filler to reach the fixed 6-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_shufps_u128
5823
5824
5825;;
5826; shufpd instruction with 8-bit immediates.
5827;
5828; @param A0 Pointer to the destination media register size operand (input/output).
5829; @param A1 Pointer to the first source media register size operand (input).
5830; @param A2 The 8-bit immediate
5831;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]              ; destination operand (input/output)
        movdqu  xmm1, [A1]              ; source operand
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6 ; dispatch on the immediate; 6-byte stubs
        movdqu  [A0], xmm0              ; write back the shuffled result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd  xmm0, xmm1, bImm        ; 5-byte encoding (66h prefix), so unlike shufps no int3 filler is needed
        ret                             ; +1 byte = exactly the 6-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_shufpd_u128
5854
5855
5856;;
5857; vshufp{s,d} instructions with 8-bit immediates.
5858;
5859; @param 1 The instruction name.
5860;
5861; @param A0 Pointer to the destination media register size operand (output).
5862; @param A1 Pointer to the first source media register size operand (input).
5863; @param A2 Pointer to the second source media register size operand (input).
5864; @param A3 The 8-bit immediate
5865;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]              ; first source
        movdqu  xmm1, [A2]              ; second source
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6 ; dispatch on the immediate; 6-byte stubs
        movdqu  [A0], xmm0              ; store the result

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm  ; 5-byte VEX encoding + 1-byte ret = 6-byte stride
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5916
5917
5918;;
5919; One of the [p]blendv{b,ps,pd} variants
5920;
5921; @param 1 The instruction
5922;
5923; @param A0 Pointer to the first media register sized operand (input/output).
5924; @param A1 Pointer to the second media sized value (input).
5925; @param A2 Pointer to the media register sized mask value (input).
5926;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; the legacy [p]blendv forms take the mask implicitly in XMM0
        movdqu  xmm1, [A0]              ; first operand (input/output)
        movdqu  xmm2, [A1]              ; second operand; [A1] itself is never written, so no save needed
        %1      xmm1, xmm2
        movdqu  [A0], xmm1              ; write back the blended result

        IEMIMPL_SSE_EPILOGUE            ; Fix: was IEMIMPL_SSE_PROLOGUE, leaving the SSE prologue unbalanced
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5946
5947
5948;;
5949; One of the v[p]blendv{b,ps,pd} variants
5950;
5951; @param 1 The instruction
5952;
5953; @param A0 Pointer to the first media register sized operand (output).
5954; @param A1 Pointer to the first media register sized operand (input).
5955; @param A2 Pointer to the second media register sized operand (input).
5956; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source
        vmovdqu xmm1, [A2]              ; second source
        vmovdqu xmm2, [A3]              ; mask (explicit 4th operand in the VEX encoding)
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0              ; store the blended result

        IEMIMPL_AVX_EPILOGUE            ; Fix: was IEMIMPL_AVX_PROLOGUE, leaving the AVX prologue unbalanced
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fix: was IEMIMPL_AVX_PROLOGUE here as well
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5990
5991
5992;;
5993; palignr mm1, mm2/m64 instruction.
5994;
5995; @param A0 Pointer to the first media register sized operand (output).
5996; @param A1 The second register sized operand (input).
5997; @param A2 The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm0, [A0]               ; destination operand (input/output)
        movq    mm1, A1                 ; second operand is passed by value
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6 ; dispatch on the immediate; 6-byte stubs
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr mm0, mm1, bImm          ; 5-byte encoding
        ret                             ; +1 byte = exactly the 6-byte stride, no filler needed
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_palignr_u64
6020
6021
6022;;
6023; SSE instructions with 8-bit immediates of the form
6024; xxx xmm1, xmm2, imm8.
6025; where the instruction encoding takes up 6 bytes.
6026;
6027; @param 1 The instruction name.
6028;
6029; @param A0 Pointer to the first media register size operand (input/output).
6030; @param A1 Pointer to the second source media register size operand (input).
6031; @param A2 The 8-bit immediate
6032;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]              ; destination operand (input/output)
        movdqu  xmm1, [A1]              ; source operand
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8 ; dispatch on the immediate; 8-byte stubs
        movdqu  [A0], xmm0              ; write back the result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm        ; 6-byte encoding (hence the macro name suffix)
        ret                             ; +1 byte
        int3                            ; +1 byte filler to reach the fixed 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
6066
6067
6068;;
6069; AVX instructions with 8-bit immediates of the form
6070; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
6071; where the instruction encoding takes up 6 bytes.
6072;
6073; @param 1 The instruction name.
6074; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
6075; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
6076;
6077; @param A0 Pointer to the destination media register size operand (output).
6078; @param A1 Pointer to the first source media register size operand (input).
6079; @param A2 Pointer to the second source media register size operand (input).
6080; @param A3 The 8-bit immediate
6081;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
 %if %2 == 1
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]              ; first source
        movdqu  xmm1, [A2]              ; second source
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; dispatch on the immediate; 8-byte stubs
        movdqu  [A0], xmm0              ; store the result

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm  ; 6-byte VEX encoding
        ret                             ; +1 byte
        int3                            ; +1 byte filler for the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
 %endif

 %if %3 == 1
; 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
6145
6146
6147;;
6148; AVX instructions with 8-bit immediates of the form
6149; xxx {x,y}mm1, {x,y}mm2, imm8.
6150; where the instruction encoding takes up 6 bytes.
6151;
6152; @param 1 The instruction name.
6153; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
6154; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
6155; @param 4 The number of bytes taken up by a single instance of the instruction.
6156;
6157; @param A0 Pointer to the destination media register size operand (output).
6158; @param A1 Pointer to the first source media register size operand (input).
6159; @param A2 The 8-bit immediate
6160;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP 4
 %if %2 == 1
; 128-bit variant.  NOTE(review): uses the 4-arg prologue although only
; A0..A2 are documented - presumably harmless overcounting; verify.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]              ; source operand
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4 ; dispatch on the immediate; stub stride = %4 bytes
        movdqu  [A0], xmm0              ; store the result

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm        ; encoding size varies per instruction, hence the %4 stride parameter
        ret
        int3                            ; filler up to the %4-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
 %endif

 %if %3 == 1
; 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilps, 1, 1, 8
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilpd, 1, 1, 8
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpslldq, 1, 1, 7
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpsrldq, 1, 1, 7
6217
6218
6219;;
6220; Need to move this as well somewhere better?
6221;
; Source-operand capture structure for the [v]pcmpistr{i,m} helpers below.
struc IEMPCMPISTRXSRC
 .uSrc1 resd 4                          ; first 128-bit source operand
 .uSrc2 resd 4                          ; second 128-bit source operand
endstruc

; Source-operand capture structure for the [v]pcmpestr{i,m} helpers below,
; which additionally take explicit lengths in RAX/RDX.
struc IEMPCMPESTRXSRC
 .uSrc1 resd 4                          ; first 128-bit source operand
 .uSrc2 resd 4                          ; second 128-bit source operand
 .u64Rax resd 2                         ; value loaded into RAX before the instruction
 .u64Rdx resd 2                         ; value loaded into RDX before the instruction
endstruc
6233
6234;;
6235; The pcmpistri/vcmpistri instruction.
6236;
6237; @param 1 The instruction name
6238;
6239; @return R0_32 The new ECX value.
6240; @param A0 Pointer to the EFLAGS register.
6241; @param A1 Pointer to the first operand (input).
6242; @param A2 Pointer to the second operand (input).
6243; @param A3 The 8-bit immediate
6244;
%macro IEMIMPL_MEDIA_V_CMPISTRI 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]              ; first operand
        movdqu  xmm1, [A2]              ; second operand
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; dispatch on the immediate; 8-byte stubs

        IEM_SAVE_FLAGS_OLD T2, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov     R0_32, ecx              ; pcmpistri leaves the index in ECX; return it

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm        ; 6-byte encoding (legacy and VEX forms alike)
        ret                             ; +1 byte
        int3                            ; +1 byte filler for the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_V_CMPISTRI pcmpistri
IEMIMPL_MEDIA_V_CMPISTRI vpcmpistri
6276
6277
6278;;
6279; The pcmpestri instruction.
6280;
6281; @param 1 The instruction name
6282;
6283; @param A0 Pointer to the ECX register to store the result to (output).
6284; @param A1 Pointer to the EFLAGS register.
6285; @param A2 Pointer to the structure containing the source operands (input).
6286; @param A3 The 8-bit immediate
6287;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8 ; compute the stub address only; the call happens after RAX/RDX are set up
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX                     ; restore A1/A2 before they are used below
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov     [T2], ecx               ; store the resulting index to the caller's ECX variable

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db 0x48                         ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm      ; 1 (REX.W) + 6 + 1 (ret) = the 8-byte stride
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_pcmpestri_u128
6320
6321
6322;;
6323; The vpcmpestri instruction.
6324;
6325; @param 1 The instruction name
6326;
6327; @param A0 Pointer to the ECX register to store the result to (output).
6328; @param A1 Pointer to the EFLAGS register.
6329; @param A2 Pointer to the structure containing the source operands (input).
6330; @param A3 The 8-bit immediate
6331;
BEGINPROC_FASTCALL iemAImpl_vpcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8 ; compute the stub address only; called after RAX/RDX setup
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX                     ; restore A1/A2 before use below
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov     [T2], ecx               ; store the resulting index to the caller's ECX variable

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db 0xc4, 0xe3, 0xf9, 0x61, 0xc1, bImm ; vpcmpestri xmm0, xmm1, bImm - hand-encoded with VEX.W set (NASM won't emit VEX.W=1 here)
        ret                             ; 6 bytes + ret + int3 = the 8-byte stride
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpcmpestri_u128
6364
6365
6366;;
6367; The pcmpistrm/vpcmpistrm instruction template.
6368;
6369; @param 1 The instruction name
6370;
6371; @param A0 Pointer to the XMM0 register to store the result to (output).
6372; @param A1 Pointer to the EFLAGS register.
6373; @param A2 Pointer to the structure containing the source operands (input).
6374; @param A3 The 8-bit immediate
6375;
%macro IEMIMPL_MEDIA_V_CMPISTRM 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1] ; xmm0 is reserved for the implicit result of pcmpistrm
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; dispatch on the immediate; 8-byte stubs

        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu  [A0], xmm0              ; the mask result lands in XMM0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm1, xmm2, bImm        ; 6-byte encoding
        ret                             ; +1 byte
        int3                            ; +1 byte filler for the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_V_CMPISTRM pcmpistrm
IEMIMPL_MEDIA_V_CMPISTRM vpcmpistrm
6406
6407
6408;;
6409; The pcmpestrm instruction.
6410;
6411; @param A0 Pointer to the XMM0 register to store the result to (output).
6412; @param A1 Pointer to the EFLAGS register.
6413; @param A2 Pointer to the structure containing the source operands (input).
6414; @param A3 The 8-bit immediate
6415;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1] ; xmm0 is reserved for the implicit result
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8 ; compute the stub address only; called after RAX/RDX setup
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX                     ; restore A1/A2 before use below
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu  [A0], xmm0              ; the mask result lands in XMM0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db 0x48                         ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm      ; 1 (REX.W) + 6 + 1 (ret) = the 8-byte stride
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_pcmpestrm_u128
6447
6448
6449;;
6450; The vpcmpestrm instruction.
6451;
6452; @param A0 Pointer to the XMM0 register to store the result to (output).
6453; @param A1 Pointer to the EFLAGS register.
6454; @param A2 Pointer to the structure containing the source operands (input).
6455; @param A3 The 8-bit immediate
6456;
BEGINPROC_FASTCALL iemAImpl_vpcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1] ; xmm0 is reserved for the implicit result
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8 ; compute the stub address only; called after RAX/RDX setup
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX                     ; restore A1/A2 before use below
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu  [A0], xmm0              ; the mask result lands in XMM0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db 0xc4, 0xe3, 0xf9, 0x60, 0xca, bImm ; vpcmpestrm xmm1, xmm2, bImm - hand-encoded with VEX.W set
        ret                             ; 6 bytes + ret + int3 = the 8-byte stride
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpcmpestrm_u128
6488
6489
6490;;
6491; movmskp{s,d} SSE instruction template
6492;
6493; @param 1 The SSE instruction name.
6494; @param 2 The AVX instruction name.
6495;
6496; @param A0 Pointer to the output register (output/byte sized).
6497; @param A1 Pointer to the source media register size operand (input).
6498;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
; SSE form: extract the packed sign-bit mask of [A1] into the byte at [A0].
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm2, [A1]              ; source vector
        %1      T0, xmm2                ; sign-bit mask into a GPR
        mov     byte [A0], T0_8         ; only the low byte is stored

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; AVX 128-bit form.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm2, [A1]
        %2      T0, xmm2
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

; AVX 256-bit form.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm2, [A1]
        %2      T0, ymm2
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6539
6540
6541;;
6542; Template for [v]cvttss2si/[v]cvtss2si instructions.
6543;
6544; @param 1 Instruction name.
6545; @param 2 AVX or SSE
6546;
6547; @return R0_32 The new MXCSR value of the guest.
6548; @param A0_32 The guest's MXCSR register value to use.
6549; @param A1 Pointer to the result operand (output).
6550; @param A2 Pointer to the second operand (input).
6551;
%macro IEMIMPL_MEDIA_V_CVTXSS2SI 2
; 32-bit integer result variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i32_r32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE    ; expands to the SSE or AVX prologue depending on %2
        SSE_AVX_LD_MXCSR A0_32         ; run the conversion under the guest MXCSR

        %1      T0_32, [A2]            ; convert the r32 at [A2] to i32
        mov     dword [A1], T0_32

        SSE_AVX_ST_MXCSR R0_32, A0_32  ; return updated MXCSR, restore host value
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _i32_r32


; 64-bit integer result variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i64_r32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        %1      T0, [A2]               ; 64-bit destination register selects the REX.W form
        mov     qword [A1], T0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _i64_r32
%endmacro

IEMIMPL_MEDIA_V_CVTXSS2SI cvttss2si, SSE
IEMIMPL_MEDIA_V_CVTXSS2SI vcvttss2si, AVX
IEMIMPL_MEDIA_V_CVTXSS2SI cvtss2si, SSE
IEMIMPL_MEDIA_V_CVTXSS2SI vcvtss2si, AVX
6585
6586
6587;;
6588; Template for [v]cvttsd2si/[v]cvtsd2si instructions.
6589;
6590; @param 1 Instruction name.
6591; @param 2 AVX or SSE
6592;
6593; @return R0_32 The new MXCSR value of the guest.
6594; @param A0_32 The guest's MXCSR register value to use.
6595; @param A1 Pointer to the result operand (output).
6596; @param A2 Pointer to the second operand (input).
6597;
%macro IEMIMPL_MEDIA_V_CVTXSD2SI 2
; 32-bit integer result variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i32_r64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE    ; expands to the SSE or AVX prologue depending on %2
        SSE_AVX_LD_MXCSR A0_32         ; run the conversion under the guest MXCSR

        %1      T0_32, [A2]            ; convert the r64 at [A2] to i32
        mov     dword [A1], T0_32

        SSE_AVX_ST_MXCSR R0_32, A0_32  ; return updated MXCSR, restore host value
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _i32_r64


; 64-bit integer result variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i64_r64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        %1      T0, [A2]               ; 64-bit destination register selects the REX.W form
        mov     qword [A1], T0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _i64_r64
%endmacro

IEMIMPL_MEDIA_V_CVTXSD2SI cvttsd2si, SSE
IEMIMPL_MEDIA_V_CVTXSD2SI vcvttsd2si, AVX
IEMIMPL_MEDIA_V_CVTXSD2SI cvtsd2si, SSE
IEMIMPL_MEDIA_V_CVTXSD2SI vcvtsd2si, AVX
6631
6632
6633;;
6634; cvtsi2ss instruction - 32-bit variant.
6635;
6636; @return R0_32 The new MXCSR value of the guest.
6637; @param A0_32 The guest's MXCSR register value to use.
6638; @param A1 Pointer to the result operand (output).
6639; @param A2 Pointer to the second operand (input).
6640;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32          ; guest rounding mode + exception state

        mov     T0_32, [A2]             ; fetch the 32-bit integer
        cvtsi2ss xmm1, T0_32            ; convert to single precision
        movd    dword [A1], xmm1        ; store the scalar result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
6653
6654
6655;;
6656; vcvtsi2ss instruction - 32-bit variant.
6657;
6658; @return R0_32 The new MXCSR value of the guest.
6659; @param A0_32 The guest's MXCSR register value to use.
6660; @param A1 Pointer to the result operand (output).
6661; @param A2 Pointer to the second operand (input).
6662; @param A3 Pointer to the third operand (input).
6663;
6664BEGINPROC_FASTCALL iemAImpl_vcvtsi2ss_u128_i32, 16
6665 PROLOGUE_3_ARGS
6666 IEMIMPL_AVX_PROLOGUE
6667 SSE_AVX_LD_MXCSR A0_32
6668
6669 movdqu xmm0, [A2]
6670 vcvtsi2ss xmm0, xmm0, dword [A3]
6671 movdqu [A1], xmm0
6672
6673 SSE_AVX_ST_MXCSR R0_32, A0_32
6674 IEMIMPL_AVX_EPILOGUE
6675 EPILOGUE_3_ARGS
6676ENDPROC iemAImpl_vcvtsi2ss_u128_i32
6677
6678
6679;;
6680; cvtsi2ss instruction - 64-bit variant.
6681;
6682; @return R0_32 The new MXCSR value of the guest.
6683; @param A0_32 The guest's MXCSR register value to use.
6684; @param A1 Pointer to the result operand (output).
6685; @param A2 Pointer to the second operand (input).
6686;
6687BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
6688 PROLOGUE_3_ARGS
6689 IEMIMPL_SSE_PROLOGUE
6690 SSE_AVX_LD_MXCSR A0_32
6691
6692 cvtsi2ss xmm0, qword [A2]
6693 movd dword [A1], xmm0
6694
6695 SSE_AVX_ST_MXCSR R0_32, A0_32
6696 IEMIMPL_SSE_EPILOGUE
6697 EPILOGUE_3_ARGS
6698ENDPROC iemAImpl_cvtsi2ss_r32_i64
6699
6700
6701;;
6702; vcvtsi2ss instruction - 64-bit variant.
6703;
6704; @return R0_32 The new MXCSR value of the guest.
6705; @param A0_32 The guest's MXCSR register value to use.
6706; @param A1 Pointer to the result operand (output).
6707; @param A2 Pointer to the second operand (input).
6708; @param A3 Pointer to the third operand (input).
6709;
6710BEGINPROC_FASTCALL iemAImpl_vcvtsi2ss_u128_i64, 16
6711 PROLOGUE_3_ARGS
6712 IEMIMPL_AVX_PROLOGUE
6713 SSE_AVX_LD_MXCSR A0_32
6714
6715 movdqu xmm0, [A2]
6716 vcvtsi2ss xmm0, xmm0, qword [A3]
6717 movdqu [A1], xmm0
6718
6719 SSE_AVX_ST_MXCSR R0_32, A0_32
6720 IEMIMPL_AVX_EPILOGUE
6721 EPILOGUE_3_ARGS
6722ENDPROC iemAImpl_vcvtsi2ss_u128_i64
6723
6724
6725;;
6726; cvtsi2sd instruction - 32-bit variant.
6727;
6728; Converts a signed 32-bit integer (in memory) to double precision; the
6729; conversion is exact, but MXCSR is still loaded/stored for guest fidelity.
6727;
6728; @return R0_32 The new MXCSR value of the guest.
6729; @param A0_32 The guest's MXCSR register value to use.
6730; @param A1 Pointer to the result operand (output).
6731; @param A2 Pointer to the second operand (input).
6732;
6733BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
6734 PROLOGUE_3_ARGS
6735 IEMIMPL_SSE_PROLOGUE
6736 SSE_AVX_LD_MXCSR A0_32
6737
6738 cvtsi2sd xmm0, dword [A2] ; i32 -> r64 in low lane of xmm0
6739 movq [A1], xmm0
6740
6741 SSE_AVX_ST_MXCSR R0_32, A0_32
6742 IEMIMPL_SSE_EPILOGUE
6743 EPILOGUE_3_ARGS
6744ENDPROC iemAImpl_cvtsi2sd_r64_i32
6745
6746
6747;;
6748; vcvtsi2sd instruction - 32-bit variant.
6749;
6750; Merges the converted double into the low lane of the 128-bit value from A2
6751; and stores the full 128-bit result to A1.
6749;
6750; @return R0_32 The new MXCSR value of the guest.
6751; @param A0_32 The guest's MXCSR register value to use.
6752; @param A1 Pointer to the result operand (output).
6753; @param A2 Pointer to the second operand (input).
6754; @param A3 Pointer to the third operand (input).
6755;
; NOTE(review): A3 is referenced but PROLOGUE_3_ARGS is used - presumably fine
; where A0..A3 are all register arguments; confirm for stack-argument hosts.
6756BEGINPROC_FASTCALL iemAImpl_vcvtsi2sd_u128_i32, 16
6757 PROLOGUE_3_ARGS
6758 IEMIMPL_AVX_PROLOGUE
6759 SSE_AVX_LD_MXCSR A0_32
6760
6761 movdqu xmm0, [A2] ; pass-through lanes come from the first source
6762 vcvtsi2sd xmm0, xmm0, dword [A3]
6763 movdqu [A1], xmm0
6764
6765 SSE_AVX_ST_MXCSR R0_32, A0_32
6766 IEMIMPL_AVX_EPILOGUE
6767 EPILOGUE_3_ARGS
6768ENDPROC iemAImpl_vcvtsi2sd_u128_i32
6769
6770
6771;;
6772; cvtsi2sd instruction - 64-bit variant.
6773;
6774; @return R0_32 The new MXCSR value of the guest.
6775; @param A0_32 The guest's MXCSR register value to use.
6776; @param A1 Pointer to the result operand (output).
6777; @param A2 Pointer to the second operand (input).
6778;
6779BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
6780 PROLOGUE_3_ARGS
6781 IEMIMPL_SSE_PROLOGUE
6782 SSE_AVX_LD_MXCSR A0_32
6783
6784 cvtsi2sd xmm0, qword [A2] ; i64 -> r64 (may round per MXCSR RC)
6785 movq [A1], xmm0
6786
6787 SSE_AVX_ST_MXCSR R0_32, A0_32
6788 IEMIMPL_SSE_EPILOGUE
6789 EPILOGUE_3_ARGS
6790ENDPROC iemAImpl_cvtsi2sd_r64_i64
6791
6792
6793;;
6794; vcvtsi2sd instruction - 64-bit variant.
6795;
6796; @return R0_32 The new MXCSR value of the guest.
6797; @param A0_32 The guest's MXCSR register value to use.
6798; @param A1 Pointer to the result operand (output).
6799; @param A2 Pointer to the second operand (input).
6800; @param A3 Pointer to the third operand (input).
6801;
; NOTE(review): same PROLOGUE_3_ARGS-with-A3 pattern as the i32 variant above.
6802BEGINPROC_FASTCALL iemAImpl_vcvtsi2sd_u128_i64, 16
6803 PROLOGUE_3_ARGS
6804 IEMIMPL_AVX_PROLOGUE
6805 SSE_AVX_LD_MXCSR A0_32
6806
6807 movdqu xmm0, [A2]
6808 vcvtsi2sd xmm0, xmm0, qword [A3]
6809 movdqu [A1], xmm0
6810
6811 SSE_AVX_ST_MXCSR R0_32, A0_32
6812 IEMIMPL_AVX_EPILOGUE
6813 EPILOGUE_3_ARGS
6814ENDPROC iemAImpl_vcvtsi2sd_u128_i64
6815
6816
6817;
6818; UCOMISS (SSE)
6819;
6820; @return R0_32 The new MXCSR value of the guest.
6821; @param A0_32 The guest's MXCSR register value to use (input).
6822; @param A1 Pointer to the EFLAGS value (input/output).
6823; @param A2_32 The first source operand.
6824; @param A3_32 The second source operand.
6825;
6826BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
6827 PROLOGUE_4_ARGS
6828 IEMIMPL_SSE_PROLOGUE
6829 SSE_AVX_LD_MXCSR A0_32
6830
6831 movd xmm0, A2_32
6832 movd xmm1, A3_32
6833 ucomiss xmm0, xmm1
6834 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6835
6836 SSE_AVX_ST_MXCSR R0_32, A0_32
6837 IEMIMPL_SSE_EPILOGUE
6838 EPILOGUE_4_ARGS
6839ENDPROC iemAImpl_ucomiss_u128
6840
;
; VUCOMISS (AVX) - same contract as the ucomiss helper above.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2_32 The first source operand.
; @param A3_32 The second source operand.
;
6841BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
6842 PROLOGUE_4_ARGS
6843 IEMIMPL_SSE_PROLOGUE
6844 SSE_AVX_LD_MXCSR A0_32
6845
6846 movd xmm0, A2_32
6847 movd xmm1, A3_32
6848 vucomiss xmm0, xmm1
6849 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6850
6851 SSE_AVX_ST_MXCSR R0_32, A0_32
6852 IEMIMPL_SSE_EPILOGUE
6853 EPILOGUE_4_ARGS ; was EPILOGUE_3_ARGS: must pair with PROLOGUE_4_ARGS above (matches ucomiss/vucomisd siblings).
6854ENDPROC iemAImpl_vucomiss_u128
6855
6856
6857;
6858; UCOMISD (SSE)
6859;
6860; @return R0_32 The new MXCSR value of the guest.
6861; @param A0_32 The guest's MXCSR register value to use (input).
6862; @param A1 Pointer to the EFLAGS value (input/output).
6863; @param A2 The first source operand.
6864; @param A3 The second source operand.
6865;
6866BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
6867 PROLOGUE_4_ARGS
6868 IEMIMPL_SSE_PROLOGUE
6869 SSE_AVX_LD_MXCSR A0_32
6870
6871 movq xmm0, A2
6872 movq xmm1, A3
6873 ucomisd xmm0, xmm1
6874 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6875
6876 SSE_AVX_ST_MXCSR R0_32, A0_32
6877 IEMIMPL_SSE_EPILOGUE
6878 EPILOGUE_4_ARGS
6879ENDPROC iemAImpl_ucomisd_u128
6880
6881BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
6882 PROLOGUE_4_ARGS
6883 IEMIMPL_SSE_PROLOGUE
6884 SSE_AVX_LD_MXCSR A0_32
6885
6886 movq xmm0, A2
6887 movq xmm1, A3
6888 vucomisd xmm0, xmm1
6889 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6890
6891 SSE_AVX_ST_MXCSR R0_32, A0_32
6892 IEMIMPL_SSE_EPILOGUE
6893 EPILOGUE_4_ARGS
6894ENDPROC iemAImpl_vucomisd_u128
6895
6896;
6897; COMISS (SSE)
6898;
6899; @return R0_32 The new MXCSR value of the guest.
6900; @param A0_32 The guest's MXCSR register value to use (input).
6901; @param A1 Pointer to the EFLAGS value (input/output).
6902; @param A2_32 The first source operand.
6903; @param A3_32 The second source operand.
6904;
6905BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
6906 PROLOGUE_4_ARGS
6907 IEMIMPL_SSE_PROLOGUE
6908 SSE_AVX_LD_MXCSR A0_32
6909
6910 movd xmm0, A2_32
6911 movd xmm1, A3_32
6912 comiss xmm0, xmm1
6913 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6914
6915 SSE_AVX_ST_MXCSR R0_32, A0_32
6916 IEMIMPL_SSE_EPILOGUE
6917 EPILOGUE_4_ARGS
6918ENDPROC iemAImpl_comiss_u128
6919
6920BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
6921 PROLOGUE_4_ARGS
6922 IEMIMPL_SSE_PROLOGUE
6923 SSE_AVX_LD_MXCSR A0_32
6924
6925 movd xmm0, A2_32
6926 movd xmm1, A3_32
6927 vcomiss xmm0, xmm1
6928 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6929
6930 SSE_AVX_ST_MXCSR R0_32, A0_32
6931 IEMIMPL_SSE_EPILOGUE
6932 EPILOGUE_4_ARGS
6933ENDPROC iemAImpl_vcomiss_u128
6934
6935
6936;
6937; COMISD (SSE)
6938;
6939; @return R0_32 The new MXCSR value of the guest.
6940; @param A0_32 The guest's MXCSR register value to use (input).
6941; @param A1 Pointer to the EFLAGS value (input/output).
6942; @param A2 The first source operand.
6943; @param A3 The second source operand.
6944;
6945BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
6946 PROLOGUE_4_ARGS
6947 IEMIMPL_SSE_PROLOGUE
6948 SSE_AVX_LD_MXCSR A0_32
6949
6950 movq xmm0, A2
6951 movq xmm1, A3
6952 comisd xmm0, xmm1
6953 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6954
6955 SSE_AVX_ST_MXCSR R0_32, A0_32
6956 IEMIMPL_SSE_EPILOGUE
6957 EPILOGUE_4_ARGS
6958ENDPROC iemAImpl_comisd_u128
6959
6960BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
6961 PROLOGUE_4_ARGS
6962 IEMIMPL_SSE_PROLOGUE
6963 SSE_AVX_LD_MXCSR A0_32
6964
6965 movq xmm0, A2
6966 movq xmm1, A3
6967 vcomisd xmm0, xmm1
6968 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6969
6970 SSE_AVX_ST_MXCSR R0_32, A0_32
6971 IEMIMPL_SSE_EPILOGUE
6972 EPILOGUE_4_ARGS
6973ENDPROC iemAImpl_vcomisd_u128
6974
6975
6976;;
6977; Need to move this as well somewhere better?
6978;
; Two packed 128-bit (XMM) source operands laid out back to back.
6979struc IEMMEDIAF2XMMSRC
6980 .uSrc1 resd 4 ; first 128-bit source
6981 .uSrc2 resd 4 ; second 128-bit source
6982endstruc
6983
6984
; Two packed 256-bit (YMM) source operands laid out back to back.
6985struc IEMMEDIAF2YMMSRC
6986 .uSrc1 resd 8 ; first 256-bit source
6987 .uSrc2 resd 8 ; second 256-bit source
6988endstruc
6989
6990
6991;;
6992; SSE/AVX instructions with 8-bit immediates of the form
6993; xxx xmm1, xmm2, imm8.
6994; vxxx xmm1, xmm2, xmm3, imm8.
6995; and we need to load and save the MXCSR register.
6996;
; Because the immediate must be encoded in the instruction, a 256-entry jump
; table is generated (%rep below) with one instruction+ret stub per imm8
; value; IEMIMPL_CALL_JUMP_TABLE_TARGET indexes it by A3 with the given
; per-entry stride.
6996;
6997; @param 1 The instruction name.
6998; @param 2 Flag whether this instruction has a 256-bit AVX variant (1) or not (0).
6999; @param 3 Number of bytes for the encoding of the SSE variant + ret instruction (AVX is fixed to 6).
7000;
7001; @return R0_32 The new MXCSR value of the guest.
7002; @param A0_32 The guest's MXCSR register value to use (input).
7003; @param A1 Pointer to the first media register size operand (output).
7004; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
7005; @param A3 The 8-bit immediate (input).
7006;
7007%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR 3
7008BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
7009 PROLOGUE_4_ARGS
7010 IEMIMPL_SSE_PROLOGUE
7011 SSE_AVX_LD_MXCSR A0_32
7012
7013 movzx A3, A3_8 ; must clear top bits
7014 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
7015 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
7016 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, %3 ; dispatch to the imm8-specific stub
7017 movdqu [A1], xmm0
7018
7019 SSE_AVX_ST_MXCSR R0_32, A0_32
7020 IEMIMPL_SSE_EPILOGUE
7021 EPILOGUE_4_ARGS
7022 %assign bImm 0
7023 %rep 256
7024.imm %+ bImm:
7025 IBT_ENDBRxx_WITHOUT_NOTRACK
7026 %1 xmm0, xmm1, bImm
7027 ret
7028 %assign bImm bImm + 1
7029 %endrep
7030.immEnd:
7031ENDPROC iemAImpl_ %+ %1 %+ _u128
7032
7033
; 128-bit AVX variant (three-operand form; first source doubles as pass-through).
7034BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
7035 PROLOGUE_4_ARGS
7036 IEMIMPL_SSE_PROLOGUE
7037 SSE_AVX_LD_MXCSR A0_32
7038
7039 movzx A3, A3_8 ; must clear top bits
7040 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
7041 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
7042 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6 ; AVX stubs are a fixed 6 bytes
7043 movdqu [A1], xmm0
7044
7045 SSE_AVX_ST_MXCSR R0_32, A0_32
7046 IEMIMPL_SSE_EPILOGUE
7047 EPILOGUE_4_ARGS
7048 %assign bImm 0
7049 %rep 256
7050.imm %+ bImm:
7051 IBT_ENDBRxx_WITHOUT_NOTRACK
7052 v %+ %1 xmm0, xmm0, xmm1, bImm
7053 ret
7054 %assign bImm bImm + 1
7055 %endrep
7056.immEnd:
7057ENDPROC iemAImpl_v %+ %1 %+ _u128
7058
; 256-bit AVX variant, only emitted when %2 == 1.
7059 %if %2 == 1
7060BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
7061 PROLOGUE_4_ARGS
7062 IEMIMPL_SSE_PROLOGUE
7063 SSE_AVX_LD_MXCSR A0_32
7064
7065 movzx A3, A3_8 ; must clear top bits
7066 vmovdqu ymm0, [A2 + IEMMEDIAF2YMMSRC.uSrc1]
7067 vmovdqu ymm1, [A2 + IEMMEDIAF2YMMSRC.uSrc2]
7068 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
7069 vmovdqu [A1], ymm0
7070
7071 SSE_AVX_ST_MXCSR R0_32, A0_32
7072 IEMIMPL_SSE_EPILOGUE
7073 EPILOGUE_4_ARGS
7074 %assign bImm 0
7075 %rep 256
7076.imm %+ bImm:
7077 IBT_ENDBRxx_WITHOUT_NOTRACK
7078 v %+ %1 ymm0, ymm0, ymm1, bImm
7079 ret
7080 %assign bImm bImm + 1
7081 %endrep
7082.immEnd:
7083ENDPROC iemAImpl_v %+ %1 %+ _u256
7084 %endif
7085%endmacro
7086
; Strides: cmpps = 4-byte insn + ret = 5; the 66h-prefixed cmppd and the
; F3/F2-prefixed cmpss/cmpsd = 5-byte insn + ret = 6.
7087IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpps, 1, 5
7088IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmppd, 1, 6
7089IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpss, 0, 6
7090IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpsd, 0, 6
7091
7092
7093;;
7094; SSE/AVX instructions with 2 full sized perands and an 8-bit immediate of the form
7095; xxx xmm1, xmm2, imm8.
7096; vxxx xmm1, xmm2, imm8
7097; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
7098; register.
7099;
; Jump-table stride is 8: 6-byte instruction + ret + int3 padding per entry.
7099;
7100; @param 1 The instruction name.
7101;
7102; @return R0_32 The new MXCSR value of the guest.
7103; @param A0_32 The guest's MXCSR register value to use (input).
7104; @param A1 Pointer to the first media register size operand (output).
7105; @param A2 Pointer to the second media register size operand (input).
7106; @param A3 The 8-bit immediate (input).
7107;
7108%macro IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 1
7109BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
7110 PROLOGUE_4_ARGS
7111 IEMIMPL_SSE_PROLOGUE
7112 SSE_AVX_LD_MXCSR A0_32
7113
7114 movzx A3, A3_8 ; must clear top bits
7115 movdqu xmm1, [A2]
7116 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; dispatch to the imm8-specific stub
7117 movdqu [A1], xmm0
7118
7119 SSE_AVX_ST_MXCSR R0_32, A0_32
7120 IEMIMPL_SSE_EPILOGUE
7121 EPILOGUE_4_ARGS
7122 %assign bImm 0
7123 %rep 256
7124.imm %+ bImm:
7125 IBT_ENDBRxx_WITHOUT_NOTRACK
7126 %1 xmm0, xmm1, bImm
7127 ret
7128 int3 ; pads each entry to the 8-byte stride
7129 %assign bImm bImm + 1
7130 %endrep
7131.immEnd:
7132ENDPROC iemAImpl_ %+ %1 %+ _u128
7133
; 128-bit AVX variant (two-operand form, single input).
7134BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
7135 PROLOGUE_4_ARGS
7136 IEMIMPL_SSE_PROLOGUE
7137 SSE_AVX_LD_MXCSR A0_32
7138
7139 movzx A3, A3_8 ; must clear top bits
7140 movdqu xmm1, [A2]
7141 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
7142 movdqu [A1], xmm0
7143
7144 SSE_AVX_ST_MXCSR R0_32, A0_32
7145 IEMIMPL_SSE_EPILOGUE
7146 EPILOGUE_4_ARGS
7147 %assign bImm 0
7148 %rep 256
7149.imm %+ bImm:
7150 IBT_ENDBRxx_WITHOUT_NOTRACK
7151 v%1 xmm0, xmm1, bImm
7152 ret
7153 int3
7154 %assign bImm bImm + 1
7155 %endrep
7156.immEnd:
7157ENDPROC iemAImpl_v %+ %1 %+ _u128
7158
; 256-bit AVX variant.
7159BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
7160 PROLOGUE_4_ARGS
7161 IEMIMPL_SSE_PROLOGUE
7162 SSE_AVX_LD_MXCSR A0_32
7163
7164 movzx A3, A3_8 ; must clear top bits
7165 vmovdqu ymm1, [A2]
7166 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
7167 vmovdqu [A1], ymm0
7168
7169 SSE_AVX_ST_MXCSR R0_32, A0_32
7170 IEMIMPL_SSE_EPILOGUE
7171 EPILOGUE_4_ARGS
7172 %assign bImm 0
7173 %rep 256
7174.imm %+ bImm:
7175 IBT_ENDBRxx_WITHOUT_NOTRACK
7176 v%1 ymm0, ymm1, bImm
7177 ret
7178 int3
7179 %assign bImm bImm + 1
7180 %endrep
7181.immEnd:
7182ENDPROC iemAImpl_v %+ %1 %+ _u256
7183%endmacro
7184
7185IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 roundps
7186IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 roundpd
7187
7188
7189;;
7190; SSE/AVX instructions with 3 full sized perands and an 8-bit immediate of the form
7191; xxx xmm1, xmm2, imm8.
7192; vxxx xmm1, xmm2, xmm3, imm8
7193; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
7194; register.
7195;
; Jump-table stride is 8: 6-byte instruction + ret + int3 padding per entry.
7195;
7196; @param 1 The instruction name.
7197; @param 2 Flag whether to emit a 256-bit AVX variant (1) or not (0).
7198;
7199; @return R0_32 The new MXCSR value of the guest.
7200; @param A0_32 The guest's MXCSR register value to use (input).
7201; @param A1 Pointer to the first media register size operand (output).
7202; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC/IEMMEDIAF2YMMSRC (input).
7203; @param A3 The 8-bit immediate (input).
7204;
7205%macro IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 2
7206BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
7207 PROLOGUE_4_ARGS
7208 IEMIMPL_SSE_PROLOGUE
7209 SSE_AVX_LD_MXCSR A0_32
7210
7211 movzx A3, A3_8 ; must clear top bits
7212 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
7213 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
7214 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; dispatch to the imm8-specific stub
7215 movdqu [A1], xmm0
7216
7217 SSE_AVX_ST_MXCSR R0_32, A0_32
7218 IEMIMPL_SSE_EPILOGUE
7219 EPILOGUE_4_ARGS
7220 %assign bImm 0
7221 %rep 256
7222.imm %+ bImm:
7223 IBT_ENDBRxx_WITHOUT_NOTRACK
7224 %1 xmm0, xmm1, bImm
7225 ret
7226 int3 ; pads each entry to the 8-byte stride
7227 %assign bImm bImm + 1
7228 %endrep
7229.immEnd:
7230ENDPROC iemAImpl_ %+ %1 %+ _u128
7231
7232
; 128-bit AVX variant (non-destructive three-operand form).
7233BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
7234 PROLOGUE_4_ARGS
7235 IEMIMPL_SSE_PROLOGUE
7236 SSE_AVX_LD_MXCSR A0_32
7237
7238 movzx A3, A3_8 ; must clear top bits
7239 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
7240 movdqu xmm2, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
7241 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
7242 movdqu [A1], xmm0
7243
7244 SSE_AVX_ST_MXCSR R0_32, A0_32
7245 IEMIMPL_SSE_EPILOGUE
7246 EPILOGUE_4_ARGS
7247 %assign bImm 0
7248 %rep 256
7249.imm %+ bImm:
7250 IBT_ENDBRxx_WITHOUT_NOTRACK
7251 v %+ %1 xmm0, xmm1, xmm2, bImm
7252 ret
7253 int3
7254 %assign bImm bImm + 1
7255 %endrep
7256.immEnd:
7257ENDPROC iemAImpl_v %+ %1 %+ _u128
7258
7259
; 256-bit AVX variant, only emitted when %2 == 1.
7260 %if %2 == 1
7261BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
7262 PROLOGUE_4_ARGS
7263 IEMIMPL_SSE_PROLOGUE
7264 SSE_AVX_LD_MXCSR A0_32
7265
7266 movzx A3, A3_8 ; must clear top bits
7267 vmovdqu ymm1, [A2 + IEMMEDIAF2YMMSRC.uSrc1]
7268 vmovdqu ymm2, [A2 + IEMMEDIAF2YMMSRC.uSrc2]
7269 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
7270 vmovdqu [A1], ymm0
7271
7272 SSE_AVX_ST_MXCSR R0_32, A0_32
7273 IEMIMPL_SSE_EPILOGUE
7274 EPILOGUE_4_ARGS
7275 %assign bImm 0
7276 %rep 256
7277.imm %+ bImm:
7278 IBT_ENDBRxx_WITHOUT_NOTRACK
7279 v %+ %1 ymm0, ymm1, ymm2, bImm
7280 ret
7281 int3
7282 %assign bImm bImm + 1
7283 %endrep
7284.immEnd:
7285ENDPROC iemAImpl_v %+ %1 %+ _u256
7286 %endif
7287%endmacro
7288
7289IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 roundss, 0
7290IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 roundsd, 0
7291IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 dpps, 1
7292IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 dppd, 0
7293
7294
7295;;
7296; SSE instructions of the form
7297; xxx mm, xmm.
7298; and we need to load and save the MXCSR register.
7299;
; Converts a packed-double 128-bit source to two packed 32-bit integers in an
; MMX-sized (64-bit) destination.
7299;
7300; @param 1 The instruction name.
7301;
7302; @return R0_32 The new MXCSR value of the guest.
7303; @param A0_32 The guest's MXCSR register value to use (input).
7304; @param A1 Pointer to the first MMX register sized operand (output).
7305; @param A2 Pointer to the media register sized operand (input).
7306;
7307%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
7308BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
7309 PROLOGUE_3_ARGS
7310 IEMIMPL_SSE_PROLOGUE
7311 SSE_AVX_LD_MXCSR A0_32
7312
7313 movdqu xmm0, [A2]
7314 %1 mm0, xmm0
7315 movq [A1], mm0
7316
7317 SSE_AVX_ST_MXCSR R0_32, A0_32
7318 IEMIMPL_SSE_EPILOGUE
7319 EPILOGUE_3_ARGS
7320ENDPROC iemAImpl_ %+ %1 %+ _u128
7321%endmacro
7322
7323IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
7324IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
7325
7326;;
7327; SSE instructions of the form
7328; xxx xmm, xmm/m64.
7329; and we need to load and save the MXCSR register.
7330;
; Converts a pair of 32-bit integers (MMX-sized source in A2) to packed
; float/double, merged into the destination read from and written back to A1.
7330;
7331; @param 1 The instruction name.
7332;
7333; @return R0_32 The new MXCSR value of the guest.
7334; @param A0_32 The guest's MXCSR register value to use (input).
7335; @param A1 Pointer to the first media register sized operand (input/output).
7336; @param A2 The 64bit source value from a MMX media register (input)
7337;
7338%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
7339BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
7340 PROLOGUE_3_ARGS
7341 IEMIMPL_SSE_PROLOGUE
7342 SSE_AVX_LD_MXCSR A0_32
7343
7344 movdqu xmm0, [A1] ; destination is input/output (upper lanes preserved)
7345 movq mm0, A2
7346 %1 xmm0, mm0
7347 movdqu [A1], xmm0
7348
7349 SSE_AVX_ST_MXCSR R0_32, A0_32
7350 IEMIMPL_SSE_EPILOGUE
7351 EPILOGUE_3_ARGS
7352ENDPROC iemAImpl_ %+ %1 %+ _u128
7353%endmacro
7354
7355IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
7356IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
7357
7358;;
7359; SSE instructions of the form
7360; xxx mm, xmm/m64.
7361; and we need to load and save the MXCSR register.
7362;
; Converts a pair of packed singles (64-bit source value) to two packed
; 32-bit integers in an MMX-sized destination.
7362;
7363; @param 1 The instruction name.
7364;
7365; @return R0_32 The new MXCSR value of the guest.
7366; @param A0_32 The guest's MXCSR register value to use (input).
7367; @param A1 Pointer to the first MMX media register sized operand (output).
7368; @param A2 The 64bit source value (input).
7369;
7370%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
7371BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
7372 PROLOGUE_3_ARGS
7373 IEMIMPL_SSE_PROLOGUE
7374 SSE_AVX_LD_MXCSR A0_32
7375
7376 movq xmm0, A2
7377 %1 mm0, xmm0
7378 movq [A1], mm0
7379
7380 SSE_AVX_ST_MXCSR R0_32, A0_32
7381 IEMIMPL_SSE_EPILOGUE
7382 EPILOGUE_3_ARGS
7383ENDPROC iemAImpl_ %+ %1 %+ _u128
7384%endmacro
7385
7386IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
7387IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
7388
7389;
7390; All forms of RDRAND and RDSEED
7391;
7392; @param A0 Pointer to the destination operand.
7393; @param A1 Pointer to the EFLAGS value (input/output).
7394;
7395%macro IEMIMPL_RDRAND_RDSEED 3
7396BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
7397 PROLOGUE_2_ARGS
7398
7399 %1 %2
7400 mov [A0], %2
7401 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF
7402
7403 EPILOGUE_2_ARGS
7404ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
7405%endmacro
7406
7407IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
7408IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
7409IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
7410IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
7411IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
7412IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
7413
7414
7415;;
7416; sha1rnds4 xmm1, xmm2, imm8.
7417;
; The 2-bit round-function selector lives in the immediate, so a 256-entry
; jump table (6-byte stride: 5-byte insn + ret) is generated below.
7417;
7418; @param 1 The instruction name.
7419;
7420; @param A0 Pointer to the first media register size operand (input/output).
7421; @param A1 Pointer to the second source media register size operand (input).
7422; @param A2 The 8-bit immediate
7423;
7424BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
7425 PROLOGUE_3_ARGS
7426 IEMIMPL_SSE_PROLOGUE
7427
7428 movzx A2, A2_8 ; must clear top bits
7429 movdqu xmm0, [A0]
7430 movdqu xmm1, [A1]
7431 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
7432 movdqu [A0], xmm0
7433
7434 IEMIMPL_SSE_EPILOGUE
7435 EPILOGUE_3_ARGS
7436 %assign bImm 0
7437 %rep 256
7438.imm %+ bImm:
7439 IBT_ENDBRxx_WITHOUT_NOTRACK
7440 sha1rnds4 xmm0, xmm1, bImm
7441 ret
7442 %assign bImm bImm + 1
7443 %endrep
7444.immEnd:
7445ENDPROC iemAImpl_sha1rnds4_u128
7446
7447
7448;;
7449; sha256rnds2 xmm1, xmm2, <XMM0>.
7450;
; XMM0 is an implicit third operand of the instruction, so the caller-provided
; constants at A2 are loaded into xmm0 before executing it.
7450;
7451; @param 1 The instruction name.
7452;
7453; @param A0 Pointer to the first media register size operand (input/output).
7454; @param A1 Pointer to the second source media register size operand (input).
7455; @param A2 Pointer to the implicit XMM0 constants (input).
7456;
7457BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
7458 PROLOGUE_3_ARGS
7459 IEMIMPL_SSE_PROLOGUE
7460
7461 movdqu xmm0, [A2] ; implicit XMM0 operand
7462 movdqu xmm1, [A0]
7463 movdqu xmm2, [A1]
7464 sha256rnds2 xmm1, xmm2
7465 movdqu [A0], xmm1
7466
7467 IEMIMPL_SSE_EPILOGUE
7468 EPILOGUE_3_ARGS
7469ENDPROC iemAImpl_sha256rnds2_u128
7470
7471
7472;
7473; 32-bit forms of ADCX and ADOX
7474;
7475; @returns Updated EFLAGS.
7476; @param A0 Incoming EFLAGS value (input).
7477; @param A1 Pointer to the destination operand (input/output).
7478; @param A2 32-bit source operand 1 (input).
7479;
7480%macro IEMIMPL_ADX_32 2
7481BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
7482 PROLOGUE_4_ARGS
7483
7484 IEM_LOAD_FLAGS A0_32, %2, 0
7485 %1 A2_32, [A1]
7486 mov [A1], A2_32
7487 IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0
7488
7489 EPILOGUE_4_ARGS
7490ENDPROC iemAImpl_ %+ %1 %+ _u32
7491%endmacro
7492
7493;
7494; 64-bit forms of ADCX and ADOX
7495;
7496; @returns Updated EFLAGS.
7497; @param A0 Incoming EFLAGS value (input).
7498; @param A1 Pointer to the destination operand (input/output).
7499; @param A2 64-bit source operand 1 (input).
7500;
7501%macro IEMIMPL_ADX_64 2
7502BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
7503 PROLOGUE_4_ARGS
7504
7505 IEM_LOAD_FLAGS A0_32, %2, 0
7506 %1 A2, [A1]
7507 mov [A1], A2
7508 IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0
7509
7510 EPILOGUE_4_ARGS
7511ENDPROC iemAImpl_ %+ %1 %+ _u64
7512%endmacro
7513
7514IEMIMPL_ADX_32 adcx, X86_EFL_CF
7515IEMIMPL_ADX_64 adcx, X86_EFL_CF
7516
7517IEMIMPL_ADX_32 adox, X86_EFL_OF
7518IEMIMPL_ADX_64 adox, X86_EFL_OF
7519
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette