VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 104269

Last change on this file since 104269 was 104269, checked in by vboxsync, 8 months ago

VMM/IEM: Rework pcmpistri emulation to pass the new ECX value as return argument freeing up one argument which can be used to pass both source operands by reference getting rid of IEMPCMPISTRXSRC for this. This enables recompilation of pcmpistri which is used by Linux a fair bit, bugref:10641

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 205.3 KB
Line 
1; $Id: IEMAllAImpl.asm 104269 2024-04-10 09:42:20Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2024 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; This is handy for generating absolutly correct EFLAGS.
43;%define IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
44
45
46;;
47; RET XX / RET wrapper for fastcall.
48;
49%macro RET_FASTCALL 1
50%ifdef RT_ARCH_X86
51 %ifdef RT_OS_WINDOWS
52 ret %1
53 %else
54 ret
55 %endif
56%else
57 ret
58%endif
59%endmacro
60
61;;
62; NAME for fastcall functions.
63;
64;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
65; escaping (or whatever the dollar is good for here). Thus the ugly
66; prefix argument.
67;
68%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
69%ifdef RT_ARCH_X86
70 %ifdef RT_OS_WINDOWS
71 %undef NAME_FASTCALL
72 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
73 %endif
74%endif
75
76;;
77; BEGINPROC for fastcall functions.
78;
79; @param 1 The function name (C).
80; @param 2 The argument size on x86.
81;
82%macro BEGINPROC_FASTCALL 2
83GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
84 IBT_ENDBRxx
85%endmacro
86
87
88;
89; We employ some macro assembly here to hid the calling convention differences.
90;
91%ifdef RT_ARCH_AMD64
92 %macro PROLOGUE_1_ARGS 0
93 %endmacro
94 %macro EPILOGUE_1_ARGS 0
95 ret
96 %endmacro
97 %macro EPILOGUE_1_ARGS_EX 0
98 ret
99 %endmacro
100
101 %macro PROLOGUE_2_ARGS 0
102 %endmacro
103 %macro EPILOGUE_2_ARGS 0
104 ret
105 %endmacro
106 %macro EPILOGUE_2_ARGS_EX 1
107 ret
108 %endmacro
109
110 %macro PROLOGUE_3_ARGS 0
111 %endmacro
112 %macro EPILOGUE_3_ARGS 0
113 ret
114 %endmacro
115 %macro EPILOGUE_3_ARGS_EX 1
116 ret
117 %endmacro
118
119 %macro PROLOGUE_4_ARGS 0
120 %endmacro
121 %macro EPILOGUE_4_ARGS 0
122 ret
123 %endmacro
124 %macro EPILOGUE_4_ARGS_EX 1
125 ret
126 %endmacro
127
128 %ifdef ASM_CALL64_GCC
129 %define A0 rdi
130 %define A0_32 edi
131 %define A0_16 di
132 %define A0_8 dil
133
134 %define A1 rsi
135 %define A1_32 esi
136 %define A1_16 si
137 %define A1_8 sil
138
139 %define A2 rdx
140 %define A2_32 edx
141 %define A2_16 dx
142 %define A2_8 dl
143
144 %define A3 rcx
145 %define A3_32 ecx
146 %define A3_16 cx
147 %define A3_8 cl
148 %endif
149
150 %ifdef ASM_CALL64_MSC
151 %define A0 rcx
152 %define A0_32 ecx
153 %define A0_16 cx
154 %define A0_8 cl
155
156 %define A1 rdx
157 %define A1_32 edx
158 %define A1_16 dx
159 %define A1_8 dl
160
161 %define A2 r8
162 %define A2_32 r8d
163 %define A2_16 r8w
164 %define A2_8 r8b
165
166 %define A3 r9
167 %define A3_32 r9d
168 %define A3_16 r9w
169 %define A3_8 r9b
170 %endif
171
172 %define T0 rax
173 %define T0_32 eax
174 %define T0_16 ax
175 %define T0_8 al
176
177 %define T1 r11
178 %define T1_32 r11d
179 %define T1_16 r11w
180 %define T1_8 r11b
181
182 %define T2 r10 ; only AMD64
183 %define T2_32 r10d
184 %define T2_16 r10w
185 %define T2_8 r10b
186
187 ;
188 ; Return value, same as T0 but to make it more obvious
189 ; that this is a return value.
190 ;
191 %define R0 rax
192 %define R0_32 eax
193 %define R0_16 ax
194 %define R0_8 al
195
196%else
197 ; x86
198 %macro PROLOGUE_1_ARGS 0
199 push edi
200 %endmacro
201 %macro EPILOGUE_1_ARGS 0
202 pop edi
203 ret 0
204 %endmacro
205 %macro EPILOGUE_1_ARGS_EX 1
206 pop edi
207 ret %1
208 %endmacro
209
210 %macro PROLOGUE_2_ARGS 0
211 push edi
212 %endmacro
213 %macro EPILOGUE_2_ARGS 0
214 pop edi
215 ret 0
216 %endmacro
217 %macro EPILOGUE_2_ARGS_EX 1
218 pop edi
219 ret %1
220 %endmacro
221
222 %macro PROLOGUE_3_ARGS 0
223 push ebx
224 mov ebx, [esp + 4 + 4]
225 push edi
226 %endmacro
227 %macro EPILOGUE_3_ARGS_EX 1
228 %if (%1) < 4
229 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
230 %endif
231 pop edi
232 pop ebx
233 ret %1
234 %endmacro
235 %macro EPILOGUE_3_ARGS 0
236 EPILOGUE_3_ARGS_EX 4
237 %endmacro
238
239 %macro PROLOGUE_4_ARGS 0
240 push ebx
241 push edi
242 push esi
243 mov ebx, [esp + 12 + 4 + 0]
244 mov esi, [esp + 12 + 4 + 4]
245 %endmacro
246 %macro EPILOGUE_4_ARGS_EX 1
247 %if (%1) < 8
248 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
249 %endif
250 pop esi
251 pop edi
252 pop ebx
253 ret %1
254 %endmacro
255 %macro EPILOGUE_4_ARGS 0
256 EPILOGUE_4_ARGS_EX 8
257 %endmacro
258
259 %define A0 ecx
260 %define A0_32 ecx
261 %define A0_16 cx
262 %define A0_8 cl
263
264 %define A1 edx
265 %define A1_32 edx
266 %define A1_16 dx
267 %define A1_8 dl
268
269 %define A2 ebx
270 %define A2_32 ebx
271 %define A2_16 bx
272 %define A2_8 bl
273
274 %define A3 esi
275 %define A3_32 esi
276 %define A3_16 si
277
278 %define T0 eax
279 %define T0_32 eax
280 %define T0_16 ax
281 %define T0_8 al
282
283 %define T1 edi
284 %define T1_32 edi
285 %define T1_16 di
286%endif
287
288
289;;
290; Load the relevant flags from [%1] if there are undefined flags (%3).
291;
292; @remarks Clobbers T0, stack. Changes EFLAGS.
293; @param 1 The parameter (A0..A3) holding the eflags value.
294; @param 2 The set of modified flags.
295; @param 3 The set of undefined flags.
296; @param 4 The flags that must be loaded.
297;
298%macro IEM_MAYBE_LOAD_FLAGS 4
299 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
300 pushf ; store current flags
301 mov T0_32, %1 ; load the guest flags
302 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified and undefined flags
303 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified and undefined flags.
304 or [xSP], T0 ; merge guest flags with host flags.
305 popf ; load the mixed flags.
306
307 %elif (%3 + %4) != 0
308 %if 1 ; This approach seems faster on intel 10980XE
309 %if (%3 | %4) == X86_EFL_CF
310 ; Use bt to load bit into CF
311 bt %1, X86_EFL_CF_BIT
312 %else
313 ; Use ADD to set OF and SHAF for the rest. ASSUMES T0_32 is eax!
314 mov eax, %1
315 %if (%3 | %4) == X86_EFL_OF
316 ; Use ADD to set OF.
317 shl eax, 31 - X86_EFL_OF_BIT
318 add eax, 80000000h
319 %elif ((%3 | %4) & X86_EFL_OF) != 0
320 ; Use ADD to set OF.
321 xchg al, ah
322 shl al, 15 - X86_EFL_OF_BIT
323 add al, 80h
324 ; Use SAHF to set the other status flags.
325 sahf
326 %else ; OF not needed; so al -> ah and load ah into eflags.
327 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
328 shl eax, 8
329 %else
330 xchg al, ah
331 %endif
332 sahf
333 %endif
334 %endif
335
336 %else
337 pushf ; store current flags
338 mov T0_32, %1 ; load the guest flags
339 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
340 and T0_32, (%2 | %3) ; select the modified and undefined flags.
341 or [xSP], T0 ; merge guest flags with host flags.
342 popf ; load the mixed flags.
343 %endif
344 %endif
345%endmacro
346
347;;
348; Load the relevant flags from [%1].
349;
350; @remarks Clobbers T0, stack. Changes EFLAGS.
351; @param 1 The parameter (A0..A3) holding the eflags value.
352; @param 2 The set of flags to load.
353; @param 3 The set of undefined flags.
354;
355%macro IEM_LOAD_FLAGS 3
356 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
357 pushf ; store current flags
358 mov T0_32, %1 ; load the guest flags
359 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified, undefined and status flags
360 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
361 or [xSP], T0 ; merge guest flags with host flags.
362 popf ; load the mixed flags.
363
364 %elif 1 ; This approach seems faster on intel 10980XE
365 %if (%3 | %2) == X86_EFL_CF
366 ; Use bt to load bit into CF
367 bt %1, X86_EFL_CF_BIT
368 %else
369 mov eax, %1 ; ASSUMES T0_32 is eax!!
370 %if (%3 | %2) == X86_EFL_OF
371 ; Use ADD to set OF.
372 shl eax, 31 - X86_EFL_OF_BIT
373 add eax, 80000000h
374 %elif ((%3 | %2) & X86_EFL_OF) != 0
375 ; Use ADD to set OF.
376 xchg al, ah
377 shl al, 15 - X86_EFL_OF_BIT
378 add al, 80h
379 ; Use SAHF to set the other status flags.
380 sahf
381 %else ; OF not needed; so al -> ah and load ah into eflags.
382 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
383 shl eax, 8
384 %else
385 xchg al, ah
386 %endif
387 sahf
388 %endif
389 %endif ; (%3 | %2) != X86_EFL_CF
390
391 %else
392 pushf ; store current flags
393 mov T0_32, %1 ; load the guest flags
394 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
395 and T0_32, (%2 | %3) ; select the modified and undefined flags.
396 or [xSP], T0 ; merge guest flags with host flags.
397 popf ; load the mixed flags.
398 %endif
399%endmacro
400
401;;
402; Merge incoming guest EFLAGS (%1) with host EFLAGS into EAX (T0).
403;
404; @remarks Clobbers T0, T1, %1, stack.
405; @param 1 The parameter (A0..A3) holding the OLD eflags value. Clobbered.
406; @param 2 The mask of modified flags to save.
407; @param 3 The mask of undefined flags to (maybe) save.
408; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
409;
410%macro IEM_SAVE_FLAGS_RETVAL 4 0
411 %if (%2 | %3 | %4) != 0
412 mov T1_32, %1 ; flags
413 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
414 pushf
415 pop T0
416 and %1, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
417 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
418 %else
419 %if (%2 | %3 | %4) == X86_EFL_CF
420 setc T0_8
421 %elif (%2 | %3) == X86_EFL_OF
422 seto T0_8
423 shl T0_32, X86_EFL_OF_BIT
424 %elif (%2 | %3) == X86_EFL_ZF
425 setz T0_8 ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
426 shl T0_32, X86_EFL_ZF_BIT
427 %elif (%2 | %3) <= 0xff
428 lahf
429 movzx eax, ah ; ASSUMES T0_32 is eax!
430 %elif 1 ; The locked functions are generally faster on 10980XE with this approach
431 lahf ; while there seems only to be a tiny advantage in most other test.
432 movzx eax, ah ; ASSUMES T0_32 is eax!
433 jno .of_is_clear
434 or eax, X86_EFL_OF
435.of_is_clear:
436 %else
437 pushf ; this is a bit slow
438 pop T0
439 %endif
440 and %1, ~(%2 | %3 | %4) ; clear the modified & undefined & zeroed flags.
441 and T0_32, (%2 | %3) ; select the modified and undefined flags.
442 %endif
443 or T0_32, %1 ; combine the flags. ASSUMES T0 = eax!
444 ;mov %1, T0_32 ; save the flags.
445 %endif
446%endmacro
447
448;;
449; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
450;
451; @remarks Clobbers T0, T1, stack.
452; @param 1 The parameter (A0..A3) holding the eflags value.
453; @param 2 The mask of modified flags to save.
454; @param 3 Mask of additional flags to always clear
455; @param 4 Mask of additional flags to always set.
456;
457;; @todo make it stuff the result into EAX?
458%macro IEM_SAVE_AND_ADJUST_FLAGS 4
459 %if (%2 | %3 | %4) != 0
460 pushf
461 pop T1
462 mov T0_32, %1 ; load flags.
463 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
464 and T1_32, (%2) ; select the modified flags.
465 or T0_32, T1_32 ; combine the flags.
466 %if (%4) != 0
467 or T0_32, %4 ; add the always set flags.
468 %endif
469 mov %1, T0_32 ; save the result.
470 %endif
471%endmacro
472
473;;
474; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
475; signed input (%4[%5]) and parity index (%6), storing the result into EAX (T0).
476;
477; @note %4 & %6 must not be RAX, EAX, or AX! So, don't use with full MUL/IMUL.
478
479; @remarks Clobbers T0, T1, stack, %6, EFLAGS, %1.
480; @param 1 The parameter (A0..A3) holding the eflags value.
481; @param 2 The mask of modified flags to save.
482; @param 3 Mask of additional flags to always clear
483; @param 4 The result register to set SF by.
484; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
485; @param 6 The (full) register containing the parity table index. Will be modified!
486%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL 6
487 pushf
488 pop T0
489 and %1, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
490 and T0_32, (%2) ; select the modified flags.
491 or T0_32, %1 ; combine the flags.
492
493 ; First calculate SF as it is the same register as %6 (only %6 is always full width).
494 bt %4, %5 - 1
495 jnc %%sf_clear
496 or T0_32, X86_EFL_SF
497 %%sf_clear:
498
499 ; Parity last.
500 and %6, 0xff
501 %ifdef RT_ARCH_AMD64
502 lea T1, [NAME(g_afParity) xWrtRIP]
503 or T0_8, [T1 + %6]
504 %else
505 or T0_8, [NAME(g_afParity) + %6]
506 %endif
507
508 ;mov %1, T0_32 ; save the result.
509 ; ASSUMES T0 = eax!
510%endmacro
511
512;;
513; Calculates the new EFLAGS using fixed clear and set bit masks.
514;
515; @remarks Clobbers T0.
516; @param 1 The parameter (A0..A3) holding the eflags value.
517; @param 2 Mask of additional flags to always clear
518; @param 3 Mask of additional flags to always set.
519;
520%macro IEM_ADJUST_FLAGS 3
521 %if (%2 | %3) != 0
522 mov T0_32, %1 ; Load flags.
523 %if (%2) != 0
524 and T0_32, ~(%2) ; Remove the always cleared flags.
525 %endif
526 %if (%3) != 0
527 or T0_32, %3 ; Add the always set flags.
528 %endif
529 mov %1, T0_32 ; Save the result.
530 %endif
531%endmacro
532
533;;
534; Calculates the new EFLAGS using fixed clear and set bit masks.
535;
536; @remarks Clobbers T0, %4, EFLAGS.
537; @param 1 The parameter (A0..A3) holding the eflags value.
538; @param 2 Mask of additional flags to always clear
539; @param 3 Mask of additional flags to always set.
540; @param 4 The (full) register containing the parity table index. Will be modified!
541;
542%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
543 mov T0_32, %1 ; Load flags.
544 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
545 %if (%3) != 0
546 or T0_32, %3 ; Add the always set flags.
547 %endif
548 and %4, 0xff
549 %ifdef RT_ARCH_AMD64
550 lea T2, [NAME(g_afParity) xWrtRIP]
551 or T0_8, [T2 + %4]
552 %else
553 or T0_8, [NAME(g_afParity) + %4]
554 %endif
555 mov %1, T0_32 ; Save the result.
556%endmacro
557
558
559;;;; OLD EFLAGS macros.
560;;;; OLD EFLAGS macros.
561;;;; OLD EFLAGS macros.
562;;;; OLD EFLAGS macros.
563;;;; OLD EFLAGS macros.
564
565;;
566; Load the relevant flags from [%1] if there are undefined flags (%3).
567;
568; @remarks Clobbers T0, stack. Changes EFLAGS.
569; @param 1 The parameter (A0..A3) pointing to the eflags.
570; @param 2 The set of modified flags.
571; @param 3 The set of undefined flags.
572; @param 4 The flags that must be loaded.
573;
574%macro IEM_MAYBE_LOAD_FLAGS_OLD 4
575 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
576 pushf ; store current flags
577 mov T0_32, [%1] ; load the guest flags
578 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified and undefined flags
579 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified and undefined flags.
580 or [xSP], T0 ; merge guest flags with host flags.
581 popf ; load the mixed flags.
582
583 %elif (%3 + %4) != 0
584 %if 1 ; This approach seems faster on intel 10980XE
585 %if (%3 | %4) == X86_EFL_CF
586 ; Use bt to load bit into CF
587 bt dword [%1], X86_EFL_CF_BIT
588 %else
589 ; Use ADD to set OF and SHAF for the rest. ASSUMES T0_32 is eax!
590 mov eax, [%1]
591 %if (%3 | %4) == X86_EFL_OF
592 ; Use ADD to set OF.
593 shl eax, 31 - X86_EFL_OF_BIT
594 add eax, 80000000h
595 %elif ((%3 | %4) & X86_EFL_OF) != 0
596 ; Use ADD to set OF.
597 xchg al, ah
598 shl al, 15 - X86_EFL_OF_BIT
599 add al, 80h
600 ; Use SAHF to set the other status flags.
601 sahf
602 %else ; OF not needed; so al -> ah and load ah into eflags.
603 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
604 shl eax, 8
605 %else
606 xchg al, ah
607 %endif
608 sahf
609 %endif
610 %endif
611
612 %else
613 pushf ; store current flags
614 mov T0_32, [%1] ; load the guest flags
615 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
616 and T0_32, (%2 | %3) ; select the modified and undefined flags.
617 or [xSP], T0 ; merge guest flags with host flags.
618 popf ; load the mixed flags.
619 %endif
620 %endif
621%endmacro
622
623;;
624; Load the relevant flags from [%1].
625;
626; @remarks Clobbers T0, stack. Changes EFLAGS.
627; @param 1 The parameter (A0..A3) pointing to the eflags.
628; @param 2 The set of flags to load.
629; @param 3 The set of undefined flags.
630;
631%macro IEM_LOAD_FLAGS_OLD 3
632 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
633 pushf ; store current flags
634 mov T0_32, [%1] ; load the guest flags
635 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified, undefined and status flags
636 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
637 or [xSP], T0 ; merge guest flags with host flags.
638 popf ; load the mixed flags.
639
640 %elif 1 ; This approach seems faster on intel 10980XE
641 %if (%3 | %2) == X86_EFL_CF
642 ; Use bt to load bit into CF
643 bt dword [%1], X86_EFL_CF_BIT
644 %else
645 mov eax, [%1] ; ASSUMES T0_32 is eax!!
646 %if (%3 | %2) == X86_EFL_OF
647 ; Use ADD to set OF.
648 shl eax, 31 - X86_EFL_OF_BIT
649 add eax, 80000000h
650 %elif ((%3 | %2) & X86_EFL_OF) != 0
651 ; Use ADD to set OF.
652 xchg al, ah
653 shl al, 15 - X86_EFL_OF_BIT
654 add al, 80h
655 ; Use SAHF to set the other status flags.
656 sahf
657 %else ; OF not needed; so al -> ah and load ah into eflags.
658 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
659 shl eax, 8
660 %else
661 xchg al, ah
662 %endif
663 sahf
664 %endif
665 %endif ; (%3 | %2) != X86_EFL_CF
666
667 %else
668 pushf ; store current flags
669 mov T0_32, [%1] ; load the guest flags
670 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
671 and T0_32, (%2 | %3) ; select the modified and undefined flags.
672 or [xSP], T0 ; merge guest flags with host flags.
673 popf ; load the mixed flags.
674 %endif
675%endmacro
676
677;;
678; Update the flag.
679;
680; @remarks Clobbers T0, T1, stack.
681; @param 1 The register pointing to the EFLAGS.
682; @param 2 The mask of modified flags to save.
683; @param 3 The mask of undefined flags to (maybe) save.
684; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
685;
686%macro IEM_SAVE_FLAGS_OLD 4 0
687 %if (%2 | %3 | %4) != 0
688 mov T1_32, [%1] ; flags
689 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
690 pushf
691 pop T0
692 and T1_32, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
693 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
694 %else
695 %if (%2 | %3 | %4) == X86_EFL_CF
696 setc T0_8
697 %elif (%2 | %3) == X86_EFL_OF
698 seto T0_8
699 shl T0_32, X86_EFL_OF_BIT
700 %elif (%2 | %3) == X86_EFL_ZF
701 setz T0_8 ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
702 shl T0_32, X86_EFL_ZF_BIT
703 %elif (%2 | %3) <= 0xff
704 lahf
705 movzx eax, ah ; ASSUMES T0_32 is eax!
706 %elif 1 ; The locked functions are generally faster on 10980XE with this approach
707 lahf ; while there seems only to be a tiny advantage in most other test.
708 movzx eax, ah ; ASSUMES T0_32 is eax!
709 jno .of_is_clear
710 or eax, X86_EFL_OF
711.of_is_clear:
712 %else
713 pushf ; this is a bit slow
714 pop T0
715 %endif
716 and T1_32, ~(%2 | %3 | %4) ; clear the modified & undefined & zeroed flags.
717 and T0_32, (%2 | %3) ; select the modified and undefined flags.
718 %endif
719 or T0_32, T1_32 ; combine the flags.
720 mov [%1], T0_32 ; save the flags.
721 %endif
722%endmacro
723
724;;
725; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
726;
727; @remarks Clobbers T0, T1, stack.
728; @param 1 The register pointing to the EFLAGS.
729; @param 2 The mask of modified flags to save.
730; @param 3 Mask of additional flags to always clear
731; @param 4 Mask of additional flags to always set.
732;
733%macro IEM_SAVE_AND_ADJUST_FLAGS_OLD 4
734 %if (%2 | %3 | %4) != 0
735 pushf
736 pop T1
737 mov T0_32, [%1] ; load flags.
738 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
739 and T1_32, (%2) ; select the modified flags.
740 or T0_32, T1_32 ; combine the flags.
741 %if (%4) != 0
742 or T0_32, %4 ; add the always set flags.
743 %endif
744 mov [%1], T0_32 ; save the result.
745 %endif
746%endmacro
747
748;;
749; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
750; signed input (%4[%5]) and parity index (%6).
751;
752; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
753; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
754; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
755;
756; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
757; @param 1 The register pointing to the EFLAGS.
758; @param 2 The mask of modified flags to save.
759; @param 3 Mask of additional flags to always clear
760; @param 4 The result register to set SF by.
761; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
762; @param 6 The (full) register containing the parity table index. Will be modified!
763
764%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD 6
765 %ifdef RT_ARCH_AMD64
766 pushf
767 pop T2
768 %else
769 push T0
770 pushf
771 pop T0
772 %endif
773 mov T1_32, [%1] ; load flags.
774 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
775 %ifdef RT_ARCH_AMD64
776 and T2_32, (%2) ; select the modified flags.
777 or T1_32, T2_32 ; combine the flags.
778 %else
779 and T0_32, (%2) ; select the modified flags.
780 or T1_32, T0_32 ; combine the flags.
781 pop T0
782 %endif
783
784 ; First calculate SF as it's likely to be refereing to the same register as %6 does.
785 bt %4, %5 - 1
786 jnc %%sf_clear
787 or T1_32, X86_EFL_SF
788 %%sf_clear:
789
790 ; Parity last.
791 and %6, 0xff
792 %ifdef RT_ARCH_AMD64
793 lea T2, [NAME(g_afParity) xWrtRIP]
794 or T1_8, [T2 + %6]
795 %else
796 or T1_8, [NAME(g_afParity) + %6]
797 %endif
798
799 mov [%1], T1_32 ; save the result.
800%endmacro
801
802;;
803; Calculates the new EFLAGS using fixed clear and set bit masks.
804;
805; @remarks Clobbers T0.
806; @param 1 The register pointing to the EFLAGS.
807; @param 2 Mask of additional flags to always clear
808; @param 3 Mask of additional flags to always set.
809;
810%macro IEM_ADJUST_FLAGS_OLD 3
811 %if (%2 | %3) != 0
812 mov T0_32, [%1] ; Load flags.
813 %if (%2) != 0
814 and T0_32, ~(%2) ; Remove the always cleared flags.
815 %endif
816 %if (%3) != 0
817 or T0_32, %3 ; Add the always set flags.
818 %endif
819 mov [%1], T0_32 ; Save the result.
820 %endif
821%endmacro
822
823;;
824; Calculates the new EFLAGS using fixed clear and set bit masks.
825;
826; @remarks Clobbers T0, %4, EFLAGS.
827; @param 1 The register pointing to the EFLAGS.
828; @param 2 Mask of additional flags to always clear
829; @param 3 Mask of additional flags to always set.
830; @param 4 The (full) register containing the parity table index. Will be modified!
831;
832%macro IEM_ADJUST_FLAGS_WITH_PARITY_OLD 4
833 mov T0_32, [%1] ; Load flags.
834 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
835 %if (%3) != 0
836 or T0_32, %3 ; Add the always set flags.
837 %endif
838 and %4, 0xff
839 %ifdef RT_ARCH_AMD64
840 lea T2, [NAME(g_afParity) xWrtRIP]
841 or T0_8, [T2 + %4]
842 %else
843 or T0_8, [NAME(g_afParity) + %4]
844 %endif
845 mov [%1], T0_32 ; Save the result.
846%endmacro
847
848
849
850;;
851; Loads register with offset of imm8 instruction -- used by all of the instruction
852; implementations which lay out jump tables of 256x immediate byte variants.
853; Also checks that the instruction size matches the offsets in the table.
854;
855; @param 1 The register to receive the jump target address (T1).
856; @param 2 The register containing the imm8 index (A1 / A2 / A3).
857; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
858; @note Implicitly uses local symbols .imm0, .imm1, and .immEmd
859; (implementation artifacts of each instruction jump table).
860;
861; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]`.
862;
863%macro IEMIMPL_JUMP_TABLE_TARGET_INT 3
864 lea %1, [.imm0 xWrtRIP]
865 %if %3 == 5
866 lea T0, [%2 + %2*4] ; *5
867 lea %1, [%1 + T0] ; *5 + .imm0
868 %elif %3 == 6
869 lea T0, [%2 + %2*2] ; *3
870 lea %1, [%1 + T0*2] ; *6 + .imm0
871 %elif %3 == 7
872 lea T0, [%2 + %2*2] ; *3
873 lea T0, [T0 + %2*4] ; *7
874 lea %1, [%1 + T0] ; *7 + .imm0
875 %elif %3 == 8
876 lea %1, [%1 + %2*8] ; *8 + .imm0
877 %elif %3 == 9
878 lea T0, [%2 + %2*8] ; *9
879 lea %1, [%1 + T0] ; *9 + .imm0
880 %elif %3 == 10
881 lea T0, [%2 + %2*4] ; *5
882 lea %1, [%1 + T0*2] ; *10 + .imm0
883 %elif %3 == 11
884 lea T0, [%2 + %2*4] ; *5
885 lea T0, [%2 + T0*2] ; *11
886 lea %1, [%1 + T0] ; *11 + .imm0
887 %elif %3 == 12
888 lea T0, [%2 + %2*2] ; *3
889 lea %1, [%1 + T0*4] ; *12 + .imm0
890 %else
891 %error Unexpected instruction byte count in IEMIMPL_JUMP_TABLE_TARGET_INT
892 %endif
893 ; check size: 'warning: value does not fit in 8 bit field' if bad
894 times (.imm1 - .imm0 + %3) %% %3 db 999 * \
895 (.imm1 - .imm0 + %3)
896 ; check alignment: 'warning: value does not fit in 8 bit field' if bad
897 times ((.immEnd - .imm0) - 256 * %3) db 999 * \
898 ((.immEnd - .imm0) - 256 * %3)
899%endmacro
900
901%macro IEMIMPL_JUMP_TABLE_TARGET 3
902 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
903 IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, (%3 + 4)
904 %else
905 IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, %3
906 %endif
907%endmacro
908
909
910;;
911; Calls the given imm8 instruction -- used by all of the instruction
912; implementations which lay out jump tables of 256x immediate byte variants.
913;
914; @param 1 The register to receive the jump target address (T1).
915; @param 2 The register containing the imm8 index (A1 / A2 / A3).
916; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
917;
918; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]` +
919; `IBT_NOTRACK, call %1`.
920;
921%macro IEMIMPL_CALL_JUMP_TABLE_TARGET 3
922 IEMIMPL_JUMP_TABLE_TARGET %1, %2, %3
923 IBT_NOTRACK
924 call %1
925%endmacro
926
927
928;*********************************************************************************************************************************
929;* External Symbols *
930;*********************************************************************************************************************************
931extern NAME(g_afParity)
932
933
934;;
935; Macro for implementing a binary operator.
936;
937; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
938; variants, except on 32-bit system where the 64-bit accesses requires hand
939; coding.
940;
941; All the functions takes a pointer to the destination memory operand in A0,
942; the source register operand in A1 and a pointer to eflags in A2.
943;
944; @param 1 The instruction mnemonic.
945; @param 2 Non-zero if there should be a locked version.
946; @param 3 The modified flags.
947; @param 4 The undefined flags.
948; @param 5 The flags that must be loaded (ADC, SBC).
949; @param 6 The flags that will be zeroed by the operation.
950;
951%macro IEMIMPL_BIN_OP 6
952BEGINCODE
953BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
954 PROLOGUE_3_ARGS
955 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
956 %1 byte [A1], A2_8
957 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
958 EPILOGUE_3_ARGS
959ENDPROC iemAImpl_ %+ %1 %+ _u8
960
961BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
962 PROLOGUE_3_ARGS
963 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
964 %1 word [A1], A2_16
965 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
966 EPILOGUE_3_ARGS
967ENDPROC iemAImpl_ %+ %1 %+ _u16
968
969BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
970 PROLOGUE_3_ARGS
971 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
972 %1 dword [A1], A2_32
973 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
974 EPILOGUE_3_ARGS
975ENDPROC iemAImpl_ %+ %1 %+ _u32
976
977 %ifdef RT_ARCH_AMD64
978BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
979 PROLOGUE_3_ARGS
980 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
981 %1 qword [A1], A2
982 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
983 EPILOGUE_3_ARGS_EX 8
984ENDPROC iemAImpl_ %+ %1 %+ _u64
985 %endif ; RT_ARCH_AMD64
986
987 %if %2 != 0 ; locked versions requested?
988
989BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
990 PROLOGUE_3_ARGS
991 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
992 lock %1 byte [A1], A2_8
993 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
994 EPILOGUE_3_ARGS
995ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
996
997BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
998 PROLOGUE_3_ARGS
999 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
1000 lock %1 word [A1], A2_16
1001 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
1002 EPILOGUE_3_ARGS
1003ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1004
1005BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
1006 PROLOGUE_3_ARGS
1007 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
1008 lock %1 dword [A1], A2_32
1009 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
1010 EPILOGUE_3_ARGS
1011ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1012
1013 %ifdef RT_ARCH_AMD64
1014BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
1015 PROLOGUE_3_ARGS
1016 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
1017 lock %1 qword [A1], A2
1018 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
1019 EPILOGUE_3_ARGS_EX 8
1020ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1021 %endif ; RT_ARCH_AMD64
1022 %endif ; locked
1023%endmacro
1024
1025; instr,lock, modified-flags, undefined flags, must be loaded, zeroed flags
1026IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
1027IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0
1028IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
1029IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0
1030IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
1031IEMIMPL_BIN_OP or, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1032IEMIMPL_BIN_OP xor, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1033IEMIMPL_BIN_OP and, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1034IEMIMPL_BIN_OP test, 0, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1035
1036
1037;;
1038; Macro for implementing a binary operator, VEX variant with separate input/output.
1039;
1040; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1041; where the 64-bit accesses requires hand coding.
1042;
1043; All the functions takes a pointer to the destination memory operand in A0,
1044; the first source register operand in A1, the second source register operand
1045; in A2 and a pointer to eflags in A3.
1046;
1047; @param 1 The instruction mnemonic.
1048; @param 2 The modified flags.
1049; @param 3 The undefined flags.
1050; @param 4 The zeroed flags.
1051;
1052%macro IEMIMPL_VEX_BIN_OP 4
1053BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1054 PROLOGUE_4_ARGS
1055 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0 ;; @todo do we need to load undefined flags for any platform?
1056 %1 T0_32, A1_32, A2_32
1057 mov [A0], T0_32
1058 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
1059 EPILOGUE_4_ARGS
1060ENDPROC iemAImpl_ %+ %1 %+ _u32
1061
1062 %ifdef RT_ARCH_AMD64
1063BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1064 PROLOGUE_4_ARGS
1065 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0
1066 %1 T0, A1, A2
1067 mov [A0], T0
1068 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
1069 EPILOGUE_4_ARGS
1070ENDPROC iemAImpl_ %+ %1 %+ _u64
1071 %endif ; RT_ARCH_AMD64
1072%endmacro
1073
1074; instr, modified-flags, undefined-flags, zeroed-flags
1075IEMIMPL_VEX_BIN_OP andn, X86_EFL_SF | X86_EFL_ZF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
1076IEMIMPL_VEX_BIN_OP bextr, X86_EFL_ZF, X86_EFL_SF | X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
1077IEMIMPL_VEX_BIN_OP bzhi, X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF
1078
1079;;
1080; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
1081;
1082; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1083; where the 64-bit accesses requires hand coding.
1084;
1085; All the functions takes a pointer to the destination memory operand in A1,
1086; the source register operand in A2 and incoming EFLAGS in A0. Updated EFLAGS
1087; are returned in EAX.
1088;
1089; @param 1 The instruction mnemonic.
1090; @param 2 The modified flags.
1091; @param 3 The undefined flags.
1092; @param 4 The zeroed flags.
1093;
1094%macro IEMIMPL_VEX_BIN_OP_2 4
1095BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1096 PROLOGUE_4_ARGS
1097 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0 ;; @todo check if any undefined flags are passed thru
1098 mov T0_32, [A1]
1099 %1 T0_32, A2_32
1100 mov [A1], T0_32
1101 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1102 EPILOGUE_4_ARGS
1103ENDPROC iemAImpl_ %+ %1 %+ _u32
1104
1105 %ifdef RT_ARCH_AMD64
1106BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1107 PROLOGUE_4_ARGS
1108 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1109 mov T0, [A1]
1110 %1 T0, A2
1111 mov [A1], T0
1112 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1113 EPILOGUE_4_ARGS
1114ENDPROC iemAImpl_ %+ %1 %+ _u64
1115 %endif ; RT_ARCH_AMD64
1116%endmacro
1117
1118; instr, modified-flags, undefined-flags zeroed-flags
1119IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1120IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1121IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1122
1123
1124;;
1125; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
1126;
1127; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1128; where the 64-bit accesses requires hand coding.
1129;
1130; All the functions takes a pointer to the destination memory operand in A0,
1131; the first source register operand in A1, the second source register operand
1132; in A2 and a pointer to eflags in A3.
1133;
1134; @param 1 The instruction mnemonic.
1135; @param 2 Fallback instruction if applicable.
1136; @param 3 Whether to emit fallback or not.
1137;
1138%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
1139BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1140 PROLOGUE_3_ARGS
1141 %1 T0_32, A1_32, A2_32
1142 mov [A0], T0_32
1143 EPILOGUE_3_ARGS
1144ENDPROC iemAImpl_ %+ %1 %+ _u32
1145
1146 %if %3
1147BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
1148 PROLOGUE_3_ARGS
1149 %ifdef ASM_CALL64_GCC
1150 mov cl, A2_8
1151 %2 A1_32, cl
1152 mov [A0], A1_32
1153 %else
1154 xchg A2, A0
1155 %2 A1_32, cl
1156 mov [A2], A1_32
1157 %endif
1158 EPILOGUE_3_ARGS
1159ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
1160 %endif
1161
1162 %ifdef RT_ARCH_AMD64
1163BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1164 PROLOGUE_3_ARGS
1165 %1 T0, A1, A2
1166 mov [A0], T0
1167 EPILOGUE_3_ARGS
1168ENDPROC iemAImpl_ %+ %1 %+ _u64
1169
1170 %if %3
1171BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
1172 PROLOGUE_3_ARGS
1173 %ifdef ASM_CALL64_GCC
1174 mov cl, A2_8
1175 %2 A1, cl
1176 mov [A0], A1_32
1177 %else
1178 xchg A2, A0
1179 %2 A1, cl
1180 mov [A2], A1_32
1181 %endif
1182 mov [A0], A1
1183 EPILOGUE_3_ARGS
1184ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
1185 %endif
1186 %endif ; RT_ARCH_AMD64
1187%endmacro
1188
1189; instr, fallback instr, emit fallback
1190IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
1191IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
1192IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
1193IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
1194IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
1195
1196
1197;
1198; RORX uses a immediate byte for the shift count, so we only do
1199; fallback implementation of that one.
1200;
1201BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
1202 PROLOGUE_3_ARGS
1203 %ifdef ASM_CALL64_GCC
1204 mov cl, A2_8
1205 ror A1_32, cl
1206 mov [A0], A1_32
1207 %else
1208 xchg A2, A0
1209 ror A1_32, cl
1210 mov [A2], A1_32
1211 %endif
1212 EPILOGUE_3_ARGS
1213ENDPROC iemAImpl_rorx_u32
1214
1215 %ifdef RT_ARCH_AMD64
1216BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
1217 PROLOGUE_3_ARGS
1218 %ifdef ASM_CALL64_GCC
1219 mov cl, A2_8
1220 ror A1, cl
1221 mov [A0], A1
1222 %else
1223 xchg A2, A0
1224 ror A1, cl
1225 mov [A2], A1
1226 %endif
1227 EPILOGUE_3_ARGS
1228ENDPROC iemAImpl_rorx_u64
1229 %endif ; RT_ARCH_AMD64
1230
1231
1232;
1233; MULX
1234;
1235BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
1236 PROLOGUE_4_ARGS
1237%ifdef ASM_CALL64_GCC
1238 ; A2_32 is EDX - prefect
1239 mulx T0_32, T1_32, A3_32
1240 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
1241 mov [A0], T0_32
1242%else
1243 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
1244 xchg A1, A2
1245 mulx T0_32, T1_32, A3_32
1246 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
1247 mov [A0], T0_32
1248%endif
1249 EPILOGUE_4_ARGS
1250ENDPROC iemAImpl_mulx_u32
1251
1252
1253BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
1254 PROLOGUE_4_ARGS
1255%ifdef ASM_CALL64_GCC
1256 ; A2_32 is EDX, T0_32 is EAX
1257 mov eax, A3_32
1258 mul A2_32
1259 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
1260 mov [A0], edx
1261%else
1262 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
1263 xchg A1, A2
1264 mov eax, A3_32
1265 mul A2_32
1266 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
1267 mov [A0], edx
1268%endif
1269 EPILOGUE_4_ARGS
1270ENDPROC iemAImpl_mulx_u32_fallback
1271
1272%ifdef RT_ARCH_AMD64
1273BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
1274 PROLOGUE_4_ARGS
1275%ifdef ASM_CALL64_GCC
1276 ; A2 is RDX - prefect
1277 mulx T0, T1, A3
1278 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
1279 mov [A0], T0
1280%else
1281 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
1282 xchg A1, A2
1283 mulx T0, T1, A3
1284 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
1285 mov [A0], T0
1286%endif
1287 EPILOGUE_4_ARGS
1288ENDPROC iemAImpl_mulx_u64
1289
1290
1291BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
1292 PROLOGUE_4_ARGS
1293%ifdef ASM_CALL64_GCC
1294 ; A2 is RDX, T0 is RAX
1295 mov rax, A3
1296 mul A2
1297 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
1298 mov [A0], rdx
1299%else
1300 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
1301 xchg A1, A2
1302 mov rax, A3
1303 mul A2
1304 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
1305 mov [A0], rdx
1306%endif
1307 EPILOGUE_4_ARGS
1308ENDPROC iemAImpl_mulx_u64_fallback
1309
1310%endif
1311
1312
1313;;
1314; Macro for implementing a bit operator.
1315;
1316; This will generate code for the 16, 32 and 64 bit accesses with locked
1317; variants, except on 32-bit system where the 64-bit accesses requires hand
1318; coding.
1319;
1320; All the functions takes a pointer to the destination memory operand in A1,
1321; the source register operand in A2 and incoming eflags in A0.
1322;
1323; @param 1 The instruction mnemonic.
1324; @param 2 Non-zero if there should be a locked version.
1325; @param 3 The modified flags.
1326; @param 4 The undefined flags.
1327;
1328%macro IEMIMPL_BIT_OP 4
1329BEGINCODE
1330BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1331 PROLOGUE_3_ARGS
1332 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1333 %1 word [A1], A2_16
1334 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1335 EPILOGUE_3_ARGS
1336ENDPROC iemAImpl_ %+ %1 %+ _u16
1337
1338BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1339 PROLOGUE_3_ARGS
1340 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1341 %1 dword [A1], A2_32
1342 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1343 EPILOGUE_3_ARGS
1344ENDPROC iemAImpl_ %+ %1 %+ _u32
1345
1346 %ifdef RT_ARCH_AMD64
1347BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1348 PROLOGUE_3_ARGS
1349 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1350 %1 qword [A1], A2
1351 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1352 EPILOGUE_3_ARGS_EX 8
1353ENDPROC iemAImpl_ %+ %1 %+ _u64
1354 %endif ; RT_ARCH_AMD64
1355
1356 %if %2 != 0 ; locked versions requested?
1357
1358BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
1359 PROLOGUE_3_ARGS
1360 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1361 lock %1 word [A1], A2_16
1362 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1363 EPILOGUE_3_ARGS
1364ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1365
1366BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
1367 PROLOGUE_3_ARGS
1368 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1369 lock %1 dword [A1], A2_32
1370 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1371 EPILOGUE_3_ARGS
1372ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1373
1374 %ifdef RT_ARCH_AMD64
1375BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
1376 PROLOGUE_3_ARGS
1377 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1378 lock %1 qword [A1], A2
1379 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1380 EPILOGUE_3_ARGS_EX 8
1381ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1382 %endif ; RT_ARCH_AMD64
1383 %endif ; locked
1384%endmacro
1385
1386; Undefined flags are passed thru here by the intel and amd CPUs we have.
1387; modified efl, undefined eflags
1388IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1389IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1390IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1391IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1392
1393;;
1394; Macro for implementing a bit search operator.
1395;
1396; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1397; system where the 64-bit accesses requires hand coding.
1398;
1399; All the functions takes a pointer to the destination memory operand in A1,
1400; the source register operand in A2 and the incoming eflags in A0.
1401;
1402; In the ZF case the destination register is 'undefined', however it seems that
1403; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
1404; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
1405; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
1406; the behaviour of more recent CPUs (Intel 10980XE and AMD 3990X).
1407;
1408; Intel: Clear all and calculate PF in addition to ZF.
1409; AMD: Passthru all flags other than ZF.
1410;
1411; @param 1 The instruction mnemonic.
1412; @param 2 The modified flags.
1413; @param 3 The undefined flags.
1414; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
1415;
1416%macro IEMIMPL_BIT_OP2 4
1417BEGINCODE
1418; 16-bit
1419
1420BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1421 PROLOGUE_3_ARGS
1422 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1423 %1 T0_16, A2_16
1424%if %4 != 0
1425 jz .unchanged_dst
1426%endif
1427 mov [A1], T0_16
1428.unchanged_dst:
1429 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1430 EPILOGUE_3_ARGS
1431ENDPROC iemAImpl_ %+ %1 %+ _u16
1432
1433;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
1434;bad; PROLOGUE_3_ARGS
1435;bad; %1 T1_16, A1_16
1436;bad; jz .unchanged_dst
1437;bad; mov [A0], T1_16
1438;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1439;bad; EPILOGUE_3_ARGS
1440;bad;.unchanged_dst:
1441;bad;%if %4 != 0
1442;bad; mov [A0], T1_16
1443;bad;%endif
1444;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1445;bad; EPILOGUE_3_ARGS
1446;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
1447;bad;
1448;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
1449;bad; PROLOGUE_3_ARGS
1450;bad; %1 T0_16, A1_16
1451;bad;%if %4 != 0
1452;bad; jz .unchanged_dst
1453;bad;%endif
1454;bad; mov [A0], T0_16
1455;bad;.unchanged_dst:
1456;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1457;bad; EPILOGUE_3_ARGS
1458;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1459
1460; 32-bit
1461
1462BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1463 PROLOGUE_3_ARGS
1464 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1465 %1 T0_32, A2_32
1466%if %4 != 0
1467 jz .unchanged_dst
1468%endif
1469 mov [A1], T0_32
1470.unchanged_dst:
1471 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1472 EPILOGUE_3_ARGS
1473ENDPROC iemAImpl_ %+ %1 %+ _u32
1474
1475;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1476;bad; PROLOGUE_3_ARGS
1477;bad; %1 T1_32, A1_32
1478;bad;%if %4 != 0
1479;bad; jz .unchanged_dst
1480;bad;%endif
1481;bad; mov [A0], T1_32
1482;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1483;bad; EPILOGUE_3_ARGS
1484;bad;.unchanged_dst:
1485;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1486;bad; EPILOGUE_3_ARGS
1487;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1488;bad;
1489;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1490;bad; PROLOGUE_3_ARGS
1491;bad; %1 T0_32, A1_32
1492;bad;%if %4 != 0
1493;bad; jz .unchanged_dst
1494;bad;%endif
1495;bad; mov [A0], T0_32
1496;bad;.unchanged_dst:
1497;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1498;bad; EPILOGUE_3_ARGS
1499;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1500
1501
1502 %ifdef RT_ARCH_AMD64
1503; 64-bit
1504
1505BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1506 PROLOGUE_3_ARGS
1507 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1508 %1 T0, A2
1509%if %4 != 0
1510 jz .unchanged_dst
1511%endif
1512 mov [A1], T0
1513.unchanged_dst:
1514 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1515 EPILOGUE_3_ARGS_EX 8
1516ENDPROC iemAImpl_ %+ %1 %+ _u64
1517
1518;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1519;bad; PROLOGUE_3_ARGS
1520;bad; %1 T1, A1
1521;bad;%if %4 != 0
1522;bad; jz .unchanged_dst
1523;bad;%endif
1524;bad; mov [A0], T1
1525;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1526;bad; EPILOGUE_3_ARGS
1527;bad;.unchanged_dst:
1528;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1529;bad; EPILOGUE_3_ARGS
1530;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1531;bad;
1532;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1533;bad; PROLOGUE_3_ARGS
1534;bad; %1 T0, A1
1535;bad;%if %4 != 0
1536;bad; jz .unchanged_dst
1537;bad;%endif
1538;bad; mov [A0], T0
1539;bad;.unchanged_dst:
1540;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1541;bad; EPILOGUE_3_ARGS_EX 8
1542;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1543
1544 %endif ; RT_ARCH_AMD64
1545%endmacro
1546
1547IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1548IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1549IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1550IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1551
1552
1553;;
1554; Macro for implementing POPCNT.
1555;
1556; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1557; system where the 64-bit accesses requires hand coding.
1558;
1559; All the functions takes a pointer to the destination memory operand in A1,
1560; the source register operand in A2 and eflags in A0.
1561;
1562; ASSUMES Intel and AMD set EFLAGS the same way.
1563;
1564; ASSUMES the instruction does not support memory destination.
1565;
1566; @param 1 The instruction mnemonic.
1567; @param 2 The modified flags.
1568; @param 3 The undefined flags.
1569; @param 4 The zeroed flags.
1570;
1571%macro IEMIMPL_BIT_OP3 4
1572BEGINCODE
1573BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1574 PROLOGUE_3_ARGS
1575 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1576 %1 T0_16, A2_16
1577 mov [A1], T0_16
1578 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1579 EPILOGUE_3_ARGS
1580ENDPROC iemAImpl_ %+ %1 %+ _u16
1581
1582BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1583 PROLOGUE_3_ARGS
1584 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1585 %1 T0_32, A2_32
1586 mov [A1], T0_32
1587 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1588 EPILOGUE_3_ARGS
1589ENDPROC iemAImpl_ %+ %1 %+ _u32
1590
1591 %ifdef RT_ARCH_AMD64
1592BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1593 PROLOGUE_3_ARGS
1594 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1595 %1 T0, A2
1596 mov [A1], T0
1597 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1598 EPILOGUE_3_ARGS_EX 8
1599ENDPROC iemAImpl_ %+ %1 %+ _u64
1600 %endif ; RT_ARCH_AMD64
1601%endmacro
1602IEMIMPL_BIT_OP3 popcnt, X86_EFL_ZF, 0, X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF
1603
1604
1605;
1606; IMUL is also a similar but yet different case (no lock, no mem dst).
1607; The rDX:rAX variant of imul is handled together with mul further down.
1608;
1609BEGINCODE
1610; @param 1 EFLAGS that are modified.
1611; @param 2 Undefined EFLAGS.
1612; @param 3 Function suffix.
1613; @param 4 EFLAGS variation: 0 for native, 1 for intel,
1614; 2 for AMD (set AF, clear PF, ZF and SF).
1615%macro IEMIMPL_IMUL_TWO 4
1616BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1617 PROLOGUE_3_ARGS
1618 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1619 imul A2_16, word [A1]
1620 mov [A1], A2_16
1621 %if %4 != 1
1622 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1623 %else
1624 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_16, 16, A2 ; intel
1625 %endif
1626 EPILOGUE_3_ARGS
1627ENDPROC iemAImpl_imul_two_u16 %+ %3
1628
1629BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1630 PROLOGUE_3_ARGS
1631 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1632 imul A2_32, dword [A1]
1633 mov [A1], A2_32
1634 %if %4 != 1
1635 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1636 %else
1637 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_32, 32, A2 ; intel
1638 %endif
1639 EPILOGUE_3_ARGS
1640ENDPROC iemAImpl_imul_two_u32 %+ %3
1641
1642 %ifdef RT_ARCH_AMD64
1643BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1644 PROLOGUE_3_ARGS
1645 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1646 imul A2, qword [A1]
1647 mov [A1], A2
1648 %if %4 != 1
1649 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1650 %else
1651 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2, 64, A2 ; intel
1652 %endif
1653 EPILOGUE_3_ARGS_EX 8
1654ENDPROC iemAImpl_imul_two_u64 %+ %3
1655 %endif ; RT_ARCH_AMD64
1656%endmacro
1657; The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
1658; flags as is. Whereas Intel skylake (6700K and 10980XE (Cascade Lake)) always
1659; clear AF and ZF and calculates SF and PF as per the lower half of the result.
1660IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1661IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1662IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1663
1664
1665;
1666; XCHG for memory operands. This implies locking. No flag changes.
1667;
1668; Each function takes two arguments, first the pointer to the memory,
1669; then the pointer to the register. They all return void.
1670;
1671BEGINCODE
1672BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1673 PROLOGUE_2_ARGS
1674 mov T0_8, [A1]
1675 xchg [A0], T0_8
1676 mov [A1], T0_8
1677 EPILOGUE_2_ARGS
1678ENDPROC iemAImpl_xchg_u8_locked
1679
1680BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1681 PROLOGUE_2_ARGS
1682 mov T0_16, [A1]
1683 xchg [A0], T0_16
1684 mov [A1], T0_16
1685 EPILOGUE_2_ARGS
1686ENDPROC iemAImpl_xchg_u16_locked
1687
1688BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1689 PROLOGUE_2_ARGS
1690 mov T0_32, [A1]
1691 xchg [A0], T0_32
1692 mov [A1], T0_32
1693 EPILOGUE_2_ARGS
1694ENDPROC iemAImpl_xchg_u32_locked
1695
1696%ifdef RT_ARCH_AMD64
1697BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1698 PROLOGUE_2_ARGS
1699 mov T0, [A1]
1700 xchg [A0], T0
1701 mov [A1], T0
1702 EPILOGUE_2_ARGS
1703ENDPROC iemAImpl_xchg_u64_locked
1704%endif
1705
1706; Unlocked variants for fDisregardLock mode.
1707
1708BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
1709 PROLOGUE_2_ARGS
1710 mov T0_8, [A1]
1711 mov T1_8, [A0]
1712 mov [A0], T0_8
1713 mov [A1], T1_8
1714 EPILOGUE_2_ARGS
1715ENDPROC iemAImpl_xchg_u8_unlocked
1716
1717BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
1718 PROLOGUE_2_ARGS
1719 mov T0_16, [A1]
1720 mov T1_16, [A0]
1721 mov [A0], T0_16
1722 mov [A1], T1_16
1723 EPILOGUE_2_ARGS
1724ENDPROC iemAImpl_xchg_u16_unlocked
1725
1726BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
1727 PROLOGUE_2_ARGS
1728 mov T0_32, [A1]
1729 mov T1_32, [A0]
1730 mov [A0], T0_32
1731 mov [A1], T1_32
1732 EPILOGUE_2_ARGS
1733ENDPROC iemAImpl_xchg_u32_unlocked
1734
1735%ifdef RT_ARCH_AMD64
1736BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
1737 PROLOGUE_2_ARGS
1738 mov T0, [A1]
1739 mov T1, [A0]
1740 mov [A0], T0
1741 mov [A1], T1
1742 EPILOGUE_2_ARGS
1743ENDPROC iemAImpl_xchg_u64_unlocked
1744%endif
1745
1746
1747;
1748; XADD for memory operands.
1749;
1750; Each function takes three arguments, first the pointer to the
1751; memory/register, then the pointer to the register, and finally a pointer to
1752; eflags. They all return void.
1753;
1754BEGINCODE
1755BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1756 PROLOGUE_3_ARGS
1757 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1758 mov T0_8, [A1]
1759 xadd [A0], T0_8
1760 mov [A1], T0_8
1761 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1762 EPILOGUE_3_ARGS
1763ENDPROC iemAImpl_xadd_u8
1764
1765BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1766 PROLOGUE_3_ARGS
1767 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1768 mov T0_16, [A1]
1769 xadd [A0], T0_16
1770 mov [A1], T0_16
1771 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1772 EPILOGUE_3_ARGS
1773ENDPROC iemAImpl_xadd_u16
1774
1775BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1776 PROLOGUE_3_ARGS
1777 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1778 mov T0_32, [A1]
1779 xadd [A0], T0_32
1780 mov [A1], T0_32
1781 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1782 EPILOGUE_3_ARGS
1783ENDPROC iemAImpl_xadd_u32
1784
1785%ifdef RT_ARCH_AMD64
1786BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1787 PROLOGUE_3_ARGS
1788 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1789 mov T0, [A1]
1790 xadd [A0], T0
1791 mov [A1], T0
1792 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1793 EPILOGUE_3_ARGS
1794ENDPROC iemAImpl_xadd_u64
1795%endif ; RT_ARCH_AMD64
1796
1797BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
1798 PROLOGUE_3_ARGS
1799 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1800 mov T0_8, [A1]
1801 lock xadd [A0], T0_8
1802 mov [A1], T0_8
1803 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1804 EPILOGUE_3_ARGS
1805ENDPROC iemAImpl_xadd_u8_locked
1806
1807BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1808 PROLOGUE_3_ARGS
1809 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1810 mov T0_16, [A1]
1811 lock xadd [A0], T0_16
1812 mov [A1], T0_16
1813 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1814 EPILOGUE_3_ARGS
1815ENDPROC iemAImpl_xadd_u16_locked
1816
1817BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1818 PROLOGUE_3_ARGS
1819 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1820 mov T0_32, [A1]
1821 lock xadd [A0], T0_32
1822 mov [A1], T0_32
1823 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1824 EPILOGUE_3_ARGS
1825ENDPROC iemAImpl_xadd_u32_locked
1826
1827%ifdef RT_ARCH_AMD64
1828BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1829 PROLOGUE_3_ARGS
1830 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1831 mov T0, [A1]
1832 lock xadd [A0], T0
1833 mov [A1], T0
1834 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1835 EPILOGUE_3_ARGS
1836ENDPROC iemAImpl_xadd_u64_locked
1837%endif ; RT_ARCH_AMD64
1838
1839
1840;
1841; CMPXCHG8B.
1842;
1843; These are tricky register wise, so the code is duplicated for each calling
1844; convention.
1845;
1846; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1847;
1848; C-proto:
1849; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1850; uint32_t *pEFlags));
1851;
1852; Note! Identical to iemAImpl_cmpxchg16b.
1853;
1854BEGINCODE
1855BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1856%ifdef RT_ARCH_AMD64
1857 %ifdef ASM_CALL64_MSC
1858 push rbx
1859
1860 mov r11, rdx ; pu64EaxEdx (is also T1)
1861 mov r10, rcx ; pu64Dst
1862
1863 mov ebx, [r8]
1864 mov ecx, [r8 + 4]
1865 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1866 mov eax, [r11]
1867 mov edx, [r11 + 4]
1868
1869 cmpxchg8b [r10]
1870
1871 mov [r11], eax
1872 mov [r11 + 4], edx
1873 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1874
1875 pop rbx
1876 ret
1877 %else
1878 push rbx
1879
1880 mov r10, rcx ; pEFlags
1881 mov r11, rdx ; pu64EbxEcx (is also T1)
1882
1883 mov ebx, [r11]
1884 mov ecx, [r11 + 4]
1885 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1886 mov eax, [rsi]
1887 mov edx, [rsi + 4]
1888
1889 cmpxchg8b [rdi]
1890
1891 mov [rsi], eax
1892 mov [rsi + 4], edx
1893 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1894
1895 pop rbx
1896 ret
1897
1898 %endif
1899%else
1900 push esi
1901 push edi
1902 push ebx
1903 push ebp
1904
1905 mov edi, ecx ; pu64Dst
1906 mov esi, edx ; pu64EaxEdx
1907 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1908 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1909
1910 mov ebx, [ecx]
1911 mov ecx, [ecx + 4]
1912 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1913 mov eax, [esi]
1914 mov edx, [esi + 4]
1915
1916 cmpxchg8b [edi]
1917
1918 mov [esi], eax
1919 mov [esi + 4], edx
1920 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)
1921
1922 pop ebp
1923 pop ebx
1924 pop edi
1925 pop esi
1926 ret 8
1927%endif
1928ENDPROC iemAImpl_cmpxchg8b
1929
1930BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1931%ifdef RT_ARCH_AMD64
1932 %ifdef ASM_CALL64_MSC
1933 push rbx
1934
1935 mov r11, rdx ; pu64EaxEdx (is also T1)
1936 mov r10, rcx ; pu64Dst
1937
1938 mov ebx, [r8]
1939 mov ecx, [r8 + 4]
1940 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1941 mov eax, [r11]
1942 mov edx, [r11 + 4]
1943
1944 lock cmpxchg8b [r10]
1945
1946 mov [r11], eax
1947 mov [r11 + 4], edx
1948 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1949
1950 pop rbx
1951 ret
1952 %else
1953 push rbx
1954
1955 mov r10, rcx ; pEFlags
1956 mov r11, rdx ; pu64EbxEcx (is also T1)
1957
1958 mov ebx, [r11]
1959 mov ecx, [r11 + 4]
1960 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1961 mov eax, [rsi]
1962 mov edx, [rsi + 4]
1963
1964 lock cmpxchg8b [rdi]
1965
1966 mov [rsi], eax
1967 mov [rsi + 4], edx
1968 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1969
1970 pop rbx
1971 ret
1972
1973 %endif
1974%else
1975 push esi
1976 push edi
1977 push ebx
1978 push ebp
1979
1980 mov edi, ecx ; pu64Dst
1981 mov esi, edx ; pu64EaxEdx
1982 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1983 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1984
1985 mov ebx, [ecx]
1986 mov ecx, [ecx + 4]
1987 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1988 mov eax, [esi]
1989 mov edx, [esi + 4]
1990
1991 lock cmpxchg8b [edi]
1992
1993 mov [esi], eax
1994 mov [esi + 4], edx
1995 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)
1996
1997 pop ebp
1998 pop ebx
1999 pop edi
2000 pop esi
2001 ret 8
2002%endif
2003ENDPROC iemAImpl_cmpxchg8b_locked
2004
2005%ifdef RT_ARCH_AMD64
2006
2007;
2008; CMPXCHG16B.
2009;
2010; These are tricky register wise, so the code is duplicated for each calling
2011; convention.
2012;
2013; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
2014;
2015; C-proto:
2016; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
2017; uint32_t *pEFlags));
2018;
2019; Note! Identical to iemAImpl_cmpxchg8b.
2020;
2021BEGINCODE
2022BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
2023 %ifdef ASM_CALL64_MSC
2024 push rbx
2025
2026 mov r11, rdx ; pu64RaxRdx (is also T1)
2027 mov r10, rcx ; pu64Dst
2028
2029 mov rbx, [r8]
2030 mov rcx, [r8 + 8]
2031 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2032 mov rax, [r11]
2033 mov rdx, [r11 + 8]
2034
2035 cmpxchg16b [r10]
2036
2037 mov [r11], rax
2038 mov [r11 + 8], rdx
2039 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2040
2041 pop rbx
2042 ret
2043 %else
2044 push rbx
2045
2046 mov r10, rcx ; pEFlags
2047 mov r11, rdx ; pu64RbxRcx (is also T1)
2048
2049 mov rbx, [r11]
2050 mov rcx, [r11 + 8]
2051 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2052 mov rax, [rsi]
2053 mov rdx, [rsi + 8]
2054
2055 cmpxchg16b [rdi]
2056
2057 mov [rsi], rax
2058 mov [rsi + 8], rdx
2059 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2060
2061 pop rbx
2062 ret
2063
2064 %endif
2065ENDPROC iemAImpl_cmpxchg16b
2066
2067BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
2068 %ifdef ASM_CALL64_MSC
2069 push rbx
2070
2071 mov r11, rdx ; pu64RaxRdx (is also T1)
2072 mov r10, rcx ; pu64Dst
2073
2074 mov rbx, [r8]
2075 mov rcx, [r8 + 8]
2076 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2077 mov rax, [r11]
2078 mov rdx, [r11 + 8]
2079
2080 lock cmpxchg16b [r10]
2081
2082 mov [r11], rax
2083 mov [r11 + 8], rdx
2084 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2085
2086 pop rbx
2087 ret
2088 %else
2089 push rbx
2090
2091 mov r10, rcx ; pEFlags
2092 mov r11, rdx ; pu64RbxRcx (is also T1)
2093
2094 mov rbx, [r11]
2095 mov rcx, [r11 + 8]
2096 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2097 mov rax, [rsi]
2098 mov rdx, [rsi + 8]
2099
2100 lock cmpxchg16b [rdi]
2101
2102 mov [rsi], rax
2103 mov [rsi + 8], rdx
2104 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2105
2106 pop rbx
2107 ret
2108
2109 %endif
2110ENDPROC iemAImpl_cmpxchg16b_locked
2111
2112%endif ; RT_ARCH_AMD64
2113
2114
2115;
2116; CMPXCHG.
2117;
2118; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
2119;
2120; C-proto:
2121; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
2122;
2123BEGINCODE
2124%macro IEMIMPL_CMPXCHG 2
2125BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
2126 PROLOGUE_4_ARGS
2127 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2128 mov al, [A1]
2129 %1 cmpxchg [A0], A2_8
2130 mov [A1], al
2131 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2132 EPILOGUE_4_ARGS
2133ENDPROC iemAImpl_cmpxchg_u8 %+ %2
2134
2135BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
2136 PROLOGUE_4_ARGS
2137 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2138 mov ax, [A1]
2139 %1 cmpxchg [A0], A2_16
2140 mov [A1], ax
2141 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2142 EPILOGUE_4_ARGS
2143ENDPROC iemAImpl_cmpxchg_u16 %+ %2
2144
2145BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
2146 PROLOGUE_4_ARGS
2147 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2148 mov eax, [A1]
2149 %1 cmpxchg [A0], A2_32
2150 mov [A1], eax
2151 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2152 EPILOGUE_4_ARGS
2153ENDPROC iemAImpl_cmpxchg_u32 %+ %2
2154
2155BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
2156%ifdef RT_ARCH_AMD64
2157 PROLOGUE_4_ARGS
2158 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2159 mov rax, [A1]
2160 %1 cmpxchg [A0], A2
2161 mov [A1], rax
2162 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2163 EPILOGUE_4_ARGS
2164%else
2165 ;
2166 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
2167 ;
2168 push esi
2169 push edi
2170 push ebx
2171 push ebp
2172
2173 mov edi, ecx ; pu64Dst
2174 mov esi, edx ; pu64Rax
2175 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
2176 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
2177
2178 mov ebx, [ecx]
2179 mov ecx, [ecx + 4]
2180 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2181 mov eax, [esi]
2182 mov edx, [esi + 4]
2183
2184 lock cmpxchg8b [edi]
2185
2186 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
2187 jz .cmpxchg8b_not_equal
2188;; @todo this isn't correct. Need to do a 64-bit compare, not just the lower 32-bit.
2189 cmp eax, eax ; just set the other flags.
2190.store:
2191 mov [esi], eax
2192 mov [esi + 4], edx
2193 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, edi)
2194
2195 pop ebp
2196 pop ebx
2197 pop edi
2198 pop esi
2199 ret 8
2200
2201.cmpxchg8b_not_equal:
2202 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
2203 jne .store
2204 cmp [esi], eax
2205 jmp .store
2206
2207%endif
2208ENDPROC iemAImpl_cmpxchg_u64 %+ %2
2209%endmacro ; IEMIMPL_CMPXCHG
2210
2211IEMIMPL_CMPXCHG , ,
2212IEMIMPL_CMPXCHG lock, _locked
2213
2214
2215
2216;;
2217; Macro for implementing a unary operator.
2218;
2219; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
2220; variants, except on 32-bit system where the 64-bit accesses requires hand
2221; coding.
2222;
2223; All the functions takes a pointer to the destination memory operand in A0,
2224; the source register operand in A1 and a pointer to eflags in A2.
2225;
2226; @param 1 The instruction mnemonic.
2227; @param 2 The modified flags.
2228; @param 3 The undefined flags.
2229;
2230%macro IEMIMPL_UNARY_OP 3
2231BEGINCODE
2232BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
2233 PROLOGUE_2_ARGS
2234 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2235 %1 byte [A0]
2236 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2237 EPILOGUE_2_ARGS
2238ENDPROC iemAImpl_ %+ %1 %+ _u8
2239
2240BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
2241 PROLOGUE_2_ARGS
2242 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2243 lock %1 byte [A0]
2244 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2245 EPILOGUE_2_ARGS
2246ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
2247
2248BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
2249 PROLOGUE_2_ARGS
2250 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2251 %1 word [A0]
2252 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2253 EPILOGUE_2_ARGS
2254ENDPROC iemAImpl_ %+ %1 %+ _u16
2255
2256BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
2257 PROLOGUE_2_ARGS
2258 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2259 lock %1 word [A0]
2260 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2261 EPILOGUE_2_ARGS
2262ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
2263
2264BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
2265 PROLOGUE_2_ARGS
2266 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2267 %1 dword [A0]
2268 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2269 EPILOGUE_2_ARGS
2270ENDPROC iemAImpl_ %+ %1 %+ _u32
2271
2272BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
2273 PROLOGUE_2_ARGS
2274 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2275 lock %1 dword [A0]
2276 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2277 EPILOGUE_2_ARGS
2278ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
2279
2280 %ifdef RT_ARCH_AMD64
2281BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
2282 PROLOGUE_2_ARGS
2283 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2284 %1 qword [A0]
2285 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2286 EPILOGUE_2_ARGS
2287ENDPROC iemAImpl_ %+ %1 %+ _u64
2288
2289BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
2290 PROLOGUE_2_ARGS
2291 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2292 lock %1 qword [A0]
2293 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2294 EPILOGUE_2_ARGS
2295ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
2296 %endif ; RT_ARCH_AMD64
2297
2298%endmacro
2299
2300IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
2301IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
2302IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
2303IEMIMPL_UNARY_OP not, 0, 0
2304
2305
2306;
2307; BSWAP. No flag changes.
2308;
2309; Each function takes one argument, pointer to the value to bswap
2310; (input/output). They all return void.
2311;
2312BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
2313 PROLOGUE_1_ARGS
2314 mov T0_32, [A0] ; just in case any of the upper bits are used.
2315 db 66h
2316 bswap T0_32
2317 mov [A0], T0_32
2318 EPILOGUE_1_ARGS
2319ENDPROC iemAImpl_bswap_u16
2320
2321BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
2322 PROLOGUE_1_ARGS
2323 mov T0_32, [A0]
2324 bswap T0_32
2325 mov [A0], T0_32
2326 EPILOGUE_1_ARGS
2327ENDPROC iemAImpl_bswap_u32
2328
2329BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
2330%ifdef RT_ARCH_AMD64
2331 PROLOGUE_1_ARGS
2332 mov T0, [A0]
2333 bswap T0
2334 mov [A0], T0
2335 EPILOGUE_1_ARGS
2336%else
2337 PROLOGUE_1_ARGS
2338 mov T0, [A0]
2339 mov T1, [A0 + 4]
2340 bswap T0
2341 bswap T1
2342 mov [A0 + 4], T0
2343 mov [A0], T1
2344 EPILOGUE_1_ARGS
2345%endif
2346ENDPROC iemAImpl_bswap_u64
2347
2348
2349;;
2350; Macro for implementing a shift operation.
2351;
2352; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2353; 32-bit system where the 64-bit accesses requires hand coding.
2354;
2355; All the functions takes a pointer to the destination memory operand in A0,
2356; the shift count in A1 and a pointer to eflags in A2.
2357;
2358; @param 1 The instruction mnemonic.
2359; @param 2 The modified flags.
2360; @param 3 The undefined flags.
2361; @param 4 Force load flags.
2362;
2363; Makes ASSUMPTIONS about A0, A1 and A2 assignments. Specifically, that with
2364; GCC/64 we're free to use RCX/CL as it isn't used for any arguments. While
2365; MSC/64 & 32-bit fastcall are using ECX for the first argument (fEFlagsIn),
2366; so we have to switch it around with the shift count parameter registers.
2367;
2368; @note the _intel and _amd variants are implemented in C.
2369;
2370%macro IEMIMPL_SHIFT_OP 4
2371BEGINCODE
2372BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
2373 PROLOGUE_3_ARGS
2374 %ifdef ASM_CALL64_GCC
2375 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
2376 mov cl, A2_8
2377 %1 byte [A1], cl
2378 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
2379 %else
2380 xchg A2, A0
2381 IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
2382 %1 byte [A1], cl
2383 IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
2384 %endif
2385.zero_shift:
2386 EPILOGUE_3_ARGS
2387ENDPROC iemAImpl_ %+ %1 %+ _u8
2388
2389BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
2390 PROLOGUE_3_ARGS
2391 %ifdef ASM_CALL64_GCC
2392 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
2393 mov cl, A2_8
2394 %1 word [A1], cl
2395 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
2396 %else
2397 xchg A2, A0
2398 IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
2399 %1 word [A1], cl
2400 IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
2401 %endif
2402 EPILOGUE_3_ARGS
2403ENDPROC iemAImpl_ %+ %1 %+ _u16
2404
2405BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
2406 PROLOGUE_3_ARGS
2407 %ifdef ASM_CALL64_GCC
2408 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
2409 mov cl, A2_8
2410 %1 dword [A1], cl
2411 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
2412 %else
2413 xchg A2, A0
2414 IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
2415 %1 dword [A1], cl
2416 IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
2417 %endif
2418 EPILOGUE_3_ARGS
2419ENDPROC iemAImpl_ %+ %1 %+ _u32
2420
2421 %ifdef RT_ARCH_AMD64
2422BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2423 PROLOGUE_3_ARGS
2424 %ifdef ASM_CALL64_GCC
2425 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
2426 mov cl, A2_8
2427 %1 qword [A1], cl
2428 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
2429 %else
2430 xchg A2, A0
2431 IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
2432 %1 qword [A1], cl
2433 IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
2434 %endif
2435 EPILOGUE_3_ARGS
2436ENDPROC iemAImpl_ %+ %1 %+ _u64
2437 %endif ; RT_ARCH_AMD64
2438
2439%endmacro
2440
2441; These instructions will NOT modify flags if the masked shift count is zero
2442; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
2443; we have to force load all modified and undefined.
2444IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2445IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2446IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2447IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2448IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2449IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2450IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2451
2452
2453;;
2454; Macro for implementing a double precision shift operation.
2455;
2456; This will generate code for the 16, 32 and 64 bit accesses, except on
2457; 32-bit system where the 64-bit accesses requires hand coding.
2458;
2459; The functions takes the destination operand (r/m) in A0, the source (reg) in
2460; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
2461;
2462; @param 1 The instruction mnemonic.
2463; @param 2 The modified flags.
2464; @param 3 The undefined flags.
2465; @param 4 The force loaded flags.
2466;
2467; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
2468;
2469; @note the _intel and _amd variants are implemented in C.
2470;
2471%macro IEMIMPL_SHIFT_DBL_OP 4
2472BEGINCODE
2473BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
2474 PROLOGUE_4_ARGS
2475 ;IEM_LOAD_FLAGS_OLD A3, %4, %3
2476 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
2477 %ifdef ASM_CALL64_GCC
2478 xchg A3, A2
2479 %1 [A0], A1_16, cl
2480 xchg A3, A2
2481 %else
2482 xchg A0, A2
2483 %1 [A2], A1_16, cl
2484 %endif
2485 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2486 EPILOGUE_4_ARGS
2487ENDPROC iemAImpl_ %+ %1 %+ _u16
2488
2489BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
2490 PROLOGUE_4_ARGS
2491 ;IEM_LOAD_FLAGS_OLD A3, %4, %3
2492 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
2493 %ifdef ASM_CALL64_GCC
2494 xchg A3, A2
2495 %1 [A0], A1_32, cl
2496 xchg A3, A2
2497 %else
2498 xchg A0, A2
2499 %1 [A2], A1_32, cl
2500 %endif
2501 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2502 EPILOGUE_4_ARGS
2503ENDPROC iemAImpl_ %+ %1 %+ _u32
2504
2505 %ifdef RT_ARCH_AMD64
2506BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
2507 PROLOGUE_4_ARGS
2508 ;IEM_LOAD_FLAGS_OLD A3, %4, %3
2509 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
2510 %ifdef ASM_CALL64_GCC
2511 xchg A3, A2
2512 %1 [A0], A1, cl
2513 xchg A3, A2
2514 %else
2515 xchg A0, A2
2516 %1 [A2], A1, cl
2517 %endif
2518 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2519 EPILOGUE_4_ARGS_EX 12
2520ENDPROC iemAImpl_ %+ %1 %+ _u64
2521 %endif ; RT_ARCH_AMD64
2522
2523%endmacro
2524
2525; These instructions will NOT modify flags if the masked shift count is zero
2526; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
2527; we have to force load all modified and undefined.
2528IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2529IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2530
2531
2532;;
2533; Macro for implementing a multiplication operations.
2534;
2535; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2536; 32-bit system where the 64-bit accesses requires hand coding.
2537;
2538; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2539; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2540; pointer to eflags in A3.
2541;
2542; The functions all return 0 so the caller can be used for div/idiv as well as
2543; for the mul/imul implementation.
2544;
2545; @param 1 The instruction mnemonic.
2546; @param 2 The modified flags.
2547; @param 3 The undefined flags.
2548; @param 4 Name suffix.
2549; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2550;
2551; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2552;
2553%macro IEMIMPL_MUL_OP 5
2554BEGINCODE
2555BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
2556 PROLOGUE_3_ARGS
2557 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2558 mov al, [A0]
2559 %1 A1_8
2560 mov [A0], ax
2561 %if %5 != 1
2562 IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
2563 %else
2564 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX ; intel
2565 %endif
2566 xor eax, eax
2567 EPILOGUE_3_ARGS
2568ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
2569
2570BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
2571 PROLOGUE_4_ARGS
2572 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2573 mov ax, [A0]
2574 %ifdef ASM_CALL64_GCC
2575 %1 A2_16
2576 mov [A0], ax
2577 mov [A1], dx
2578 %else
2579 mov T1, A1
2580 %1 A2_16
2581 mov [A0], ax
2582 mov [T1], dx
2583 %endif
2584 %if %5 != 1
2585 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2586 %else
2587 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX ; intel
2588 %endif
2589 xor eax, eax
2590 EPILOGUE_4_ARGS
2591ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
2592
2593BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
2594 PROLOGUE_4_ARGS
2595 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2596 mov eax, [A0]
2597 %ifdef ASM_CALL64_GCC
2598 %1 A2_32
2599 mov [A0], eax
2600 mov [A1], edx
2601 %else
2602 mov T1, A1
2603 %1 A2_32
2604 mov [A0], eax
2605 mov [T1], edx
2606 %endif
2607 %if %5 != 1
2608 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2609 %else
2610 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX ; intel
2611 %endif
2612 xor eax, eax
2613 EPILOGUE_4_ARGS
2614ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
2615
2616 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2617BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2618 PROLOGUE_4_ARGS
2619 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2620 mov rax, [A0]
2621 %ifdef ASM_CALL64_GCC
2622 %1 A2
2623 mov [A0], rax
2624 mov [A1], rdx
2625 %else
2626 mov T1, A1
2627 %1 A2
2628 mov [A0], rax
2629 mov [T1], rdx
2630 %endif
2631 %if %5 != 1
2632 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2633 %else
2634 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX ; intel
2635 %endif
2636 xor eax, eax
2637 EPILOGUE_4_ARGS_EX 12
2638ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2639 %endif ; !RT_ARCH_AMD64
2640
2641%endmacro
2642
2643IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2644IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2645IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2646IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2647IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2648IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2649
2650
2651BEGINCODE
2652;;
2653; Worker function for negating a 32-bit number in T1:T0
2654; @uses None (T0,T1)
2655BEGINPROC iemAImpl_negate_T0_T1_u32
2656 push 0
2657 push 0
2658 xchg T0_32, [xSP]
2659 xchg T1_32, [xSP + xCB]
2660 sub T0_32, [xSP]
2661 sbb T1_32, [xSP + xCB]
2662 add xSP, xCB*2
2663 ret
2664ENDPROC iemAImpl_negate_T0_T1_u32
2665
2666%ifdef RT_ARCH_AMD64
2667;;
2668; Worker function for negating a 64-bit number in T1:T0
2669; @uses None (T0,T1)
2670BEGINPROC iemAImpl_negate_T0_T1_u64
2671 push 0
2672 push 0
2673 xchg T0, [xSP]
2674 xchg T1, [xSP + xCB]
2675 sub T0, [xSP]
2676 sbb T1, [xSP + xCB]
2677 add xSP, xCB*2
2678 ret
2679ENDPROC iemAImpl_negate_T0_T1_u64
2680%endif
2681
2682
2683;;
2684; Macro for implementing a division operations.
2685;
2686; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2687; 32-bit system where the 64-bit accesses requires hand coding.
2688;
2689; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2690; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2691; pointer to eflags in A3.
2692;
2693; The functions all return 0 on success and -1 if a divide error should be
2694; raised by the caller.
2695;
2696; @param 1 The instruction mnemonic.
2697; @param 2 The modified flags.
2698; @param 3 The undefined flags.
2699; @param 4 1 if signed, 0 if unsigned.
2700; @param 5 Function suffix.
2701; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2702; 2 for AMD (set AF, clear PF, ZF and SF).
2703;
2704; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2705;
2706%macro IEMIMPL_DIV_OP 6
2707BEGINCODE
2708BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2709 PROLOGUE_3_ARGS
2710
2711 ; div by chainsaw check.
2712 and A1_32, 0xff ; Ensure it's zero extended to 16-bits for the idiv range check.
2713 jz .div_zero
2714
2715 ; Overflow check - unsigned division is simple to verify, haven't
2716 ; found a simple way to check signed division yet unfortunately.
2717 %if %4 == 0
2718 cmp [A0 + 1], A1_8
2719 jae .div_overflow
2720 %else
2721 movzx T0_32, word [A0] ; T0 = dividend (zero extending to full register to simplify register aliasing)
2722 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2723 test A1_8, A1_8
2724 js .divisor_negative
2725 test T0_16, T0_16
2726 jns .both_positive
2727 neg T0_16
2728.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2729 push T0 ; Start off like unsigned below.
2730 shr T0_16, 7
2731 cmp T0_16, A1_16 ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
2732 pop T0
2733 jb .div_no_overflow
2734 ja .div_overflow
2735 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2736 cmp T0_8, A1_8
2737 jae .div_overflow
2738 jmp .div_no_overflow
2739
2740.divisor_negative:
2741 neg A1_8
2742 test T0_16, T0_16
2743 jns .one_of_each
2744 neg T0_16
2745.both_positive: ; Same as unsigned shifted by sign indicator bit.
2746 shr T0_16, 7
2747 cmp T0_16, A1_16 ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
2748 jae .div_overflow
2749.div_no_overflow:
2750 mov A1, T1 ; restore divisor
2751 %endif
2752
2753 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (Intel)
2754 mov ax, [A0]
2755 %1 A1_8
2756 mov [A0], ax
2757 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2758 IEM_ADJUST_FLAGS_OLD A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2759 %else
2760 IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
2761 %endif
2762 xor eax, eax
2763
2764.return:
2765 EPILOGUE_3_ARGS
2766
2767.div_zero:
2768.div_overflow:
2769 mov eax, -1
2770 jmp .return
2771ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2772
2773BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2774 PROLOGUE_4_ARGS
2775
2776 ; div by chainsaw check.
2777 and A2_16, 0xffff ; Zero extend it for simpler sign overflow checks (see below).
2778 jz .div_zero
2779
2780 ; Overflow check - unsigned division is simple to verify, haven't
2781 ; found a simple way to check signed division yet unfortunately.
2782 %if %4 == 0
2783 cmp [A1], A2_16
2784 jae .div_overflow
2785 %else
2786 movzx T0_32, word [A1] ; Zero extend to simplify register aliasing by clobbing the whole register.
2787 shl T0_32, 16
2788 mov T0_16, [A0] ; T0 = dividend
2789 mov T1, A2 ; T1 = divisor
2790 test T1_16, T1_16
2791 js .divisor_negative
2792 test T0_32, T0_32
2793 jns .both_positive
2794 neg T0_32
2795.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2796 push T0 ; Start off like unsigned below.
2797 shr T0_32, 15
2798 cmp T0_32, T1_32 ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
2799 pop T0
2800 jb .div_no_overflow
2801 ja .div_overflow
2802 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2803 cmp T0_16, T1_16
2804 jae .div_overflow
2805 jmp .div_no_overflow
2806
2807.divisor_negative:
2808 neg T1_16
2809 test T0_32, T0_32
2810 jns .one_of_each
2811 neg T0_32
2812.both_positive: ; Same as unsigned shifted by sign indicator bit.
2813 shr T0_32, 15
2814 cmp T0_32, T1_32 ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
2815 jae .div_overflow
2816.div_no_overflow:
2817 %endif
2818
2819 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2820 %ifdef ASM_CALL64_GCC
2821 mov T1, A2
2822 mov ax, [A0]
2823 mov dx, [A1]
2824 %1 T1_16
2825 mov [A0], ax
2826 mov [A1], dx
2827 %else
2828 mov T1, A1
2829 mov ax, [A0]
2830 mov dx, [T1]
2831 %1 A2_16
2832 mov [A0], ax
2833 mov [T1], dx
2834 %endif
2835 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2836 IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2837 %else
2838 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2839 %endif
2840 xor eax, eax
2841
2842.return:
2843 EPILOGUE_4_ARGS
2844
2845.div_zero:
2846.div_overflow:
2847 mov eax, -1
2848 jmp .return
2849ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2850
2851BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2852 PROLOGUE_4_ARGS
2853
2854 ; div by chainsaw check.
2855 test A2_32, A2_32
2856 jz .div_zero
2857
2858 ; Overflow check - unsigned division is simple to verify, haven't
2859 ; found a simple way to check signed division yet unfortunately.
2860 %if %4 == 0
2861 cmp [A1], A2_32
2862 jae .div_overflow
2863 %else
2864 push A2 ; save A2 so we modify it (we out of regs on x86).
2865 mov T0_32, [A0] ; T0 = dividend low
2866 mov T1_32, [A1] ; T1 = dividend high
2867 ;test A2_32, A2_32 - we did this 5 instructions ago.
2868 js .divisor_negative
2869 test T1_32, T1_32
2870 jns .both_positive
2871 call NAME(iemAImpl_negate_T0_T1_u32)
2872.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2873 test T1_32, 0x80000000 ; neg 0x8000000000000000 = 0x8000000000000000
2874 jnz .div_overflow
2875 push T0 ; Start off like unsigned below.
2876 shl T1_32, 1
2877 shr T0_32, 31
2878 or T1_32, T0_32
2879 cmp T1_32, A2_32
2880 pop T0
2881 jb .div_no_overflow
2882 ja .div_overflow
2883 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2884 cmp T0_32, A2_32
2885 jae .div_overflow
2886 jmp .div_no_overflow
2887
2888.divisor_negative:
2889 neg A2_32
2890 test T1_32, T1_32
2891 jns .one_of_each
2892 call NAME(iemAImpl_negate_T0_T1_u32)
2893.both_positive: ; Same as unsigned shifted by sign indicator bit.
2894 test T1_32, 0x80000000 ; neg 0x8000000000000000 = 0x8000000000000000
2895 jnz .div_overflow
2896 shl T1_32, 1
2897 shr T0_32, 31
2898 or T1_32, T0_32
2899 cmp T1_32, A2_32
2900 jae .div_overflow
2901.div_no_overflow:
2902 pop A2
2903 %endif
2904
2905 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2906 mov eax, [A0]
2907 %ifdef ASM_CALL64_GCC
2908 mov T1, A2
2909 mov eax, [A0]
2910 mov edx, [A1]
2911 %1 T1_32
2912 mov [A0], eax
2913 mov [A1], edx
2914 %else
2915 mov T1, A1
2916 mov eax, [A0]
2917 mov edx, [T1]
2918 %1 A2_32
2919 mov [A0], eax
2920 mov [T1], edx
2921 %endif
2922 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2923 IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2924 %else
2925 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2926 %endif
2927 xor eax, eax
2928
2929.return:
2930 EPILOGUE_4_ARGS
2931
2932.div_overflow:
2933 %if %4 != 0
2934 pop A2
2935 %endif
2936.div_zero:
2937 mov eax, -1
2938 jmp .return
2939ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2940
2941 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2942BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2943 PROLOGUE_4_ARGS
2944
2945 test A2, A2
2946 jz .div_zero
2947 %if %4 == 0
2948 cmp [A1], A2
2949 jae .div_overflow
2950 %else
2951 push A2 ; save A2 so we modify it (we out of regs on x86).
2952 mov T0, [A0] ; T0 = dividend low
2953 mov T1, [A1] ; T1 = dividend high
2954 ;test A2, A2 - we did this five instructions above.
2955 js .divisor_negative
2956 test T1, T1
2957 jns .both_positive
2958 call NAME(iemAImpl_negate_T0_T1_u64)
2959.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2960 bt T1, 63 ; neg 0x8000000000000000'0000000000000000 = same
2961 jc .div_overflow
2962 push T0 ; Start off like unsigned below.
2963 shl T1, 1
2964 shr T0, 63
2965 or T1, T0
2966 cmp T1, A2
2967 pop T0
2968 jb .div_no_overflow
2969 ja .div_overflow
2970 mov T1, 0x7fffffffffffffff
2971 and T0, T1 ; Special case for covering (divisor - 1).
2972 cmp T0, A2
2973 jae .div_overflow
2974 jmp .div_no_overflow
2975
2976.divisor_negative:
2977 neg A2
2978 test T1, T1
2979 jns .one_of_each
2980 call NAME(iemAImpl_negate_T0_T1_u64)
2981.both_positive: ; Same as unsigned shifted by sign indicator bit.
2982 bt T1, 63 ; neg 0x8000000000000000'0000000000000000 = same
2983 jc .div_overflow
2984 shl T1, 1
2985 shr T0, 63
2986 or T1, T0
2987 cmp T1, A2
2988 jae .div_overflow
2989.div_no_overflow:
2990 pop A2
2991 %endif
2992
2993 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2994 mov rax, [A0]
2995 %ifdef ASM_CALL64_GCC
2996 mov T1, A2
2997 mov rax, [A0]
2998 mov rdx, [A1]
2999 %1 T1
3000 mov [A0], rax
3001 mov [A1], rdx
3002 %else
3003 mov T1, A1
3004 mov rax, [A0]
3005 mov rdx, [T1]
3006 %1 A2
3007 mov [A0], rax
3008 mov [T1], rdx
3009 %endif
3010 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
3011 IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
3012 %else
3013 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
3014 %endif
3015 xor eax, eax
3016
3017.return:
3018 EPILOGUE_4_ARGS_EX 12
3019
3020.div_overflow:
3021 %if %4 != 0
3022 pop A2
3023 %endif
3024.div_zero:
3025 mov eax, -1
3026 jmp .return
3027ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
3028 %endif ; !RT_ARCH_AMD64
3029
3030%endmacro
3031
3032IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
3033IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
3034IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
3035;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
3036IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
3037IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
3038IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
3039
3040
3041;;
3042; Macro for implementing memory fence operation.
3043;
3044; No return value, no operands or anything.
3045;
3046; @param 1 The instruction.
3047;
3048%macro IEMIMPL_MEM_FENCE 1
3049BEGINCODE
3050BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
3051 %1
3052 ret
3053ENDPROC iemAImpl_ %+ %1
3054%endmacro
3055
3056IEMIMPL_MEM_FENCE lfence
3057IEMIMPL_MEM_FENCE sfence
3058IEMIMPL_MEM_FENCE mfence
3059
3060;;
3061; Alternative for non-SSE2 host.
3062;
3063BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
3064 push xAX
3065 xchg xAX, [xSP]
3066 add xSP, xCB
3067 ret
3068ENDPROC iemAImpl_alt_mem_fence
3069
3070
3071;;
3072; Initialize the FPU for the actual instruction being emulated, this means
3073; loading parts of the guest's control word and status word.
3074;
3075; @uses 24 bytes of stack. T0, T1
3076; @param 1 Expression giving the address of the FXSTATE of the guest.
3077;
3078%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
3079 fnstenv [xSP]
3080
3081 ; FCW - for exception, precision and rounding control.
3082 movzx T0, word [%1 + X86FXSTATE.FCW]
3083 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
3084 mov [xSP + X86FSTENV32P.FCW], T0_16
3085
3086 ; FSW - for undefined C0, C1, C2, and C3.
3087 movzx T1, word [%1 + X86FXSTATE.FSW]
3088 and T1, X86_FSW_C_MASK
3089 movzx T0, word [xSP + X86FSTENV32P.FSW]
3090 and T0, X86_FSW_TOP_MASK
3091 or T0, T1
3092 mov [xSP + X86FSTENV32P.FSW], T0_16
3093
3094 fldenv [xSP]
3095%endmacro
3096
3097
3098;;
3099; Initialize the FPU for the actual instruction being emulated, this means
3100; loading parts of the guest's control word, status word, and update the
3101; tag word for the top register if it's empty.
3102;
3103; ASSUMES actual TOP=7
3104;
3105; @uses 24 bytes of stack. T0, T1
3106; @param 1 Expression giving the address of the FXSTATE of the guest.
3107;
3108%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
3109 fnstenv [xSP]
3110
3111 ; FCW - for exception, precision and rounding control.
3112 movzx T0_32, word [%1 + X86FXSTATE.FCW]
3113 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
3114 mov [xSP + X86FSTENV32P.FCW], T0_16
3115
3116 ; FSW - for undefined C0, C1, C2, and C3.
3117 movzx T1_32, word [%1 + X86FXSTATE.FSW]
3118 and T1_32, X86_FSW_C_MASK
3119 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
3120 and T0_32, X86_FSW_TOP_MASK
3121 or T0_32, T1_32
3122 mov [xSP + X86FSTENV32P.FSW], T0_16
3123
3124 ; FTW - Only for ST0 (in/out).
3125 movzx T1_32, word [%1 + X86FXSTATE.FSW]
3126 shr T1_32, X86_FSW_TOP_SHIFT
3127 and T1_32, X86_FSW_TOP_SMASK
3128 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
3129 jc %%st0_not_empty
3130 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
3131%%st0_not_empty:
3132
3133 fldenv [xSP]
3134%endmacro
3135
3136
3137;;
3138; Need to move this as well somewhere better?
3139;
3140struc IEMFPURESULT
3141 .r80Result resw 5
3142 .FSW resw 1
3143endstruc
3144
3145
3146;;
3147; Need to move this as well somewhere better?
3148;
3149struc IEMFPURESULTTWO
3150 .r80Result1 resw 5
3151 .FSW resw 1
3152 .r80Result2 resw 5
3153endstruc
3154
3155
3156;
3157;---------------------- 16-bit signed integer operations ----------------------
3158;
3159
3160
3161;;
3162; Converts a 16-bit floating point value to a 80-bit one (fpu register).
3163;
3164; @param A0 FPU context (fxsave).
3165; @param A1 Pointer to a IEMFPURESULT for the output.
3166; @param A2 Pointer to the 16-bit floating point value to convert.
3167;
3168BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
3169 PROLOGUE_3_ARGS
3170 sub xSP, 20h
3171
3172 fninit
3173 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3174 fild word [A2]
3175
3176 fnstsw word [A1 + IEMFPURESULT.FSW]
3177 fnclex
3178 fstp tword [A1 + IEMFPURESULT.r80Result]
3179
3180 fninit
3181 add xSP, 20h
3182 EPILOGUE_3_ARGS
3183ENDPROC iemAImpl_fild_r80_from_i16
3184
3185
3186;;
3187; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
3188;
3189; @param A0 FPU context (fxsave).
3190; @param A1 Where to return the output FSW.
3191; @param A2 Where to store the 16-bit signed integer value.
3192; @param A3 Pointer to the 80-bit value.
3193;
3194BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
3195 PROLOGUE_4_ARGS
3196 sub xSP, 20h
3197
3198 fninit
3199 fld tword [A3]
3200 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3201 fistp word [A2]
3202
3203 fnstsw word [A1]
3204
3205 fninit
3206 add xSP, 20h
3207 EPILOGUE_4_ARGS
3208ENDPROC iemAImpl_fist_r80_to_i16
3209
3210
3211;;
3212; Store a 80-bit floating point value (register) as a 16-bit signed integer
3213; (memory) with truncation.
3214;
3215; @param A0 FPU context (fxsave).
3216; @param A1 Where to return the output FSW.
3217; @param A2 Where to store the 16-bit signed integer value.
3218; @param A3 Pointer to the 80-bit value.
3219;
3220BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
3221 PROLOGUE_4_ARGS
3222 sub xSP, 20h
3223
3224 fninit
3225 fld tword [A3]
3226 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3227 fisttp word [A2]
3228
3229 fnstsw word [A1]
3230
3231 fninit
3232 add xSP, 20h
3233 EPILOGUE_4_ARGS
3234ENDPROC iemAImpl_fistt_r80_to_i16
3235
3236
3237;;
3238; FPU instruction working on one 80-bit and one 16-bit signed integer value.
3239;
3240; @param 1 The instruction
3241;
3242; @param A0 FPU context (fxsave).
3243; @param A1 Pointer to a IEMFPURESULT for the output.
3244; @param A2 Pointer to the 80-bit value.
3245; @param A3 Pointer to the 16-bit value.
3246;
3247%macro IEMIMPL_FPU_R80_BY_I16 1
3248BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
3249 PROLOGUE_4_ARGS
3250 sub xSP, 20h
3251
3252 fninit
3253 fld tword [A2]
3254 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3255 %1 word [A3]
3256
3257 fnstsw word [A1 + IEMFPURESULT.FSW]
3258 fnclex
3259 fstp tword [A1 + IEMFPURESULT.r80Result]
3260
3261 fninit
3262 add xSP, 20h
3263 EPILOGUE_4_ARGS
3264ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
3265%endmacro
3266
3267IEMIMPL_FPU_R80_BY_I16 fiadd
3268IEMIMPL_FPU_R80_BY_I16 fimul
3269IEMIMPL_FPU_R80_BY_I16 fisub
3270IEMIMPL_FPU_R80_BY_I16 fisubr
3271IEMIMPL_FPU_R80_BY_I16 fidiv
3272IEMIMPL_FPU_R80_BY_I16 fidivr
3273
3274
3275;;
3276; FPU instruction working on one 80-bit and one 16-bit signed integer value,
3277; only returning FSW.
3278;
3279; @param 1 The instruction
3280;
3281; @param A0 FPU context (fxsave).
3282; @param A1 Where to store the output FSW.
3283; @param A2 Pointer to the 80-bit value.
3284; @param A3 Pointer to the 64-bit value.
3285;
3286%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
3287BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
3288 PROLOGUE_4_ARGS
3289 sub xSP, 20h
3290
3291 fninit
3292 fld tword [A2]
3293 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3294 %1 word [A3]
3295
3296 fnstsw word [A1]
3297
3298 fninit
3299 add xSP, 20h
3300 EPILOGUE_4_ARGS
3301ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
3302%endmacro
3303
3304IEMIMPL_FPU_R80_BY_I16_FSW ficom
3305
3306
3307
3308;
3309;---------------------- 32-bit signed integer operations ----------------------
3310;
3311
3312
3313;;
3314; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3315;
3316; @param A0 FPU context (fxsave).
3317; @param A1 Pointer to a IEMFPURESULT for the output.
3318; @param A2 Pointer to the 32-bit floating point value to convert.
3319;
3320BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
3321 PROLOGUE_3_ARGS
3322 sub xSP, 20h
3323
3324 fninit
3325 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3326 fild dword [A2]
3327
3328 fnstsw word [A1 + IEMFPURESULT.FSW]
3329 fnclex
3330 fstp tword [A1 + IEMFPURESULT.r80Result]
3331
3332 fninit
3333 add xSP, 20h
3334 EPILOGUE_3_ARGS
3335ENDPROC iemAImpl_fild_r80_from_i32
3336
3337
3338;;
3339; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
3340;
3341; @param A0 FPU context (fxsave).
3342; @param A1 Where to return the output FSW.
3343; @param A2 Where to store the 32-bit signed integer value.
3344; @param A3 Pointer to the 80-bit value.
3345;
3346BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
3347 PROLOGUE_4_ARGS
3348 sub xSP, 20h
3349
3350 fninit
3351 fld tword [A3]
3352 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3353 fistp dword [A2]
3354
3355 fnstsw word [A1]
3356
3357 fninit
3358 add xSP, 20h
3359 EPILOGUE_4_ARGS
3360ENDPROC iemAImpl_fist_r80_to_i32
3361
3362
3363;;
3364; Store a 80-bit floating point value (register) as a 32-bit signed integer
3365; (memory) with truncation.
3366;
3367; @param A0 FPU context (fxsave).
3368; @param A1 Where to return the output FSW.
3369; @param A2 Where to store the 32-bit signed integer value.
3370; @param A3 Pointer to the 80-bit value.
3371;
3372BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
3373 PROLOGUE_4_ARGS
3374 sub xSP, 20h
3375
3376 fninit
3377 fld tword [A3]
3378 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3379 fisttp dword [A2]
3380
3381 fnstsw word [A1]
3382
3383 fninit
3384 add xSP, 20h
3385 EPILOGUE_4_ARGS
3386ENDPROC iemAImpl_fistt_r80_to_i32
3387
3388
3389;;
3390; FPU instruction working on one 80-bit and one 32-bit signed integer value.
3391;
3392; @param 1 The instruction
3393;
3394; @param A0 FPU context (fxsave).
3395; @param A1 Pointer to a IEMFPURESULT for the output.
3396; @param A2 Pointer to the 80-bit value.
3397; @param A3 Pointer to the 32-bit value.
3398;
3399%macro IEMIMPL_FPU_R80_BY_I32 1
3400BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
3401 PROLOGUE_4_ARGS
3402 sub xSP, 20h
3403
3404 fninit
3405 fld tword [A2]
3406 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3407 %1 dword [A3]
3408
3409 fnstsw word [A1 + IEMFPURESULT.FSW]
3410 fnclex
3411 fstp tword [A1 + IEMFPURESULT.r80Result]
3412
3413 fninit
3414 add xSP, 20h
3415 EPILOGUE_4_ARGS
3416ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
3417%endmacro
3418
3419IEMIMPL_FPU_R80_BY_I32 fiadd
3420IEMIMPL_FPU_R80_BY_I32 fimul
3421IEMIMPL_FPU_R80_BY_I32 fisub
3422IEMIMPL_FPU_R80_BY_I32 fisubr
3423IEMIMPL_FPU_R80_BY_I32 fidiv
3424IEMIMPL_FPU_R80_BY_I32 fidivr
3425
3426
3427;;
3428; FPU instruction working on one 80-bit and one 32-bit signed integer value,
3429; only returning FSW.
3430;
3431; @param 1 The instruction
3432;
3433; @param A0 FPU context (fxsave).
3434; @param A1 Where to store the output FSW.
3435; @param A2 Pointer to the 80-bit value.
3436; @param A3 Pointer to the 64-bit value.
3437;
3438%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
3439BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
3440 PROLOGUE_4_ARGS
3441 sub xSP, 20h
3442
3443 fninit
3444 fld tword [A2]
3445 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3446 %1 dword [A3]
3447
3448 fnstsw word [A1]
3449
3450 fninit
3451 add xSP, 20h
3452 EPILOGUE_4_ARGS
3453ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
3454%endmacro
3455
3456IEMIMPL_FPU_R80_BY_I32_FSW ficom
3457
3458
3459
3460;
3461;---------------------- 64-bit signed integer operations ----------------------
3462;
3463
3464
3465;;
3466; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3467;
3468; @param A0 FPU context (fxsave).
3469; @param A1 Pointer to a IEMFPURESULT for the output.
3470; @param A2 Pointer to the 64-bit floating point value to convert.
3471;
3472BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
3473 PROLOGUE_3_ARGS
3474 sub xSP, 20h
3475
3476 fninit
3477 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3478 fild qword [A2]
3479
3480 fnstsw word [A1 + IEMFPURESULT.FSW]
3481 fnclex
3482 fstp tword [A1 + IEMFPURESULT.r80Result]
3483
3484 fninit
3485 add xSP, 20h
3486 EPILOGUE_3_ARGS
3487ENDPROC iemAImpl_fild_r80_from_i64
3488
3489
3490;;
3491; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
3492;
3493; @param A0 FPU context (fxsave).
3494; @param A1 Where to return the output FSW.
3495; @param A2 Where to store the 64-bit signed integer value.
3496; @param A3 Pointer to the 80-bit value.
3497;
3498BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
3499 PROLOGUE_4_ARGS
3500 sub xSP, 20h
3501
3502 fninit
3503 fld tword [A3]
3504 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3505 fistp qword [A2]
3506
3507 fnstsw word [A1]
3508
3509 fninit
3510 add xSP, 20h
3511 EPILOGUE_4_ARGS
3512ENDPROC iemAImpl_fist_r80_to_i64
3513
3514
3515;;
3516; Store a 80-bit floating point value (register) as a 64-bit signed integer
3517; (memory) with truncation.
3518;
3519; @param A0 FPU context (fxsave).
3520; @param A1 Where to return the output FSW.
3521; @param A2 Where to store the 64-bit signed integer value.
3522; @param A3 Pointer to the 80-bit value.
3523;
3524BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
3525 PROLOGUE_4_ARGS
3526 sub xSP, 20h
3527
3528 fninit
3529 fld tword [A3]
3530 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3531 fisttp qword [A2]
3532
3533 fnstsw word [A1]
3534
3535 fninit
3536 add xSP, 20h
3537 EPILOGUE_4_ARGS
3538ENDPROC iemAImpl_fistt_r80_to_i64
3539
3540
3541
3542;
3543;---------------------- 32-bit floating point operations ----------------------
3544;
3545
3546;;
3547; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3548;
3549; @param A0 FPU context (fxsave).
3550; @param A1 Pointer to a IEMFPURESULT for the output.
3551; @param A2 Pointer to the 32-bit floating point value to convert.
3552;
3553BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
3554 PROLOGUE_3_ARGS
3555 sub xSP, 20h
3556
3557 fninit
3558 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3559 fld dword [A2]
3560
3561 fnstsw word [A1 + IEMFPURESULT.FSW]
3562 fnclex
3563 fstp tword [A1 + IEMFPURESULT.r80Result]
3564
3565 fninit
3566 add xSP, 20h
3567 EPILOGUE_3_ARGS
3568ENDPROC iemAImpl_fld_r80_from_r32
3569
3570
3571;;
3572; Store a 80-bit floating point value (register) as a 32-bit one (memory).
3573;
3574; @param A0 FPU context (fxsave).
3575; @param A1 Where to return the output FSW.
3576; @param A2 Where to store the 32-bit value.
3577; @param A3 Pointer to the 80-bit value.
3578;
3579BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
3580 PROLOGUE_4_ARGS
3581 sub xSP, 20h
3582
3583 fninit
3584 fld tword [A3]
3585 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3586 fst dword [A2]
3587
3588 fnstsw word [A1]
3589
3590 fninit
3591 add xSP, 20h
3592 EPILOGUE_4_ARGS
3593ENDPROC iemAImpl_fst_r80_to_r32
3594
3595
3596;;
3597; FPU instruction working on one 80-bit and one 32-bit floating point value.
3598;
3599; @param 1 The instruction
3600;
3601; @param A0 FPU context (fxsave).
3602; @param A1 Pointer to a IEMFPURESULT for the output.
3603; @param A2 Pointer to the 80-bit value.
3604; @param A3 Pointer to the 32-bit value.
3605;
3606%macro IEMIMPL_FPU_R80_BY_R32 1
3607BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3608 PROLOGUE_4_ARGS
3609 sub xSP, 20h
3610
3611 fninit
3612 fld tword [A2]
3613 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3614 %1 dword [A3]
3615
3616 fnstsw word [A1 + IEMFPURESULT.FSW]
3617 fnclex
3618 fstp tword [A1 + IEMFPURESULT.r80Result]
3619
3620 fninit
3621 add xSP, 20h
3622 EPILOGUE_4_ARGS
3623ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3624%endmacro
3625
3626IEMIMPL_FPU_R80_BY_R32 fadd
3627IEMIMPL_FPU_R80_BY_R32 fmul
3628IEMIMPL_FPU_R80_BY_R32 fsub
3629IEMIMPL_FPU_R80_BY_R32 fsubr
3630IEMIMPL_FPU_R80_BY_R32 fdiv
3631IEMIMPL_FPU_R80_BY_R32 fdivr
3632
3633
3634;;
3635; FPU instruction working on one 80-bit and one 32-bit floating point value,
3636; only returning FSW.
3637;
3638; @param 1 The instruction
3639;
3640; @param A0 FPU context (fxsave).
3641; @param A1 Where to store the output FSW.
3642; @param A2 Pointer to the 80-bit value.
3643; @param A3 Pointer to the 64-bit value.
3644;
3645%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3646BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3647 PROLOGUE_4_ARGS
3648 sub xSP, 20h
3649
3650 fninit
3651 fld tword [A2]
3652 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3653 %1 dword [A3]
3654
3655 fnstsw word [A1]
3656
3657 fninit
3658 add xSP, 20h
3659 EPILOGUE_4_ARGS
3660ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3661%endmacro
3662
3663IEMIMPL_FPU_R80_BY_R32_FSW fcom
3664
3665
3666
3667;
3668;---------------------- 64-bit floating point operations ----------------------
3669;
3670
3671;;
3672; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3673;
3674; @param A0 FPU context (fxsave).
3675; @param A1 Pointer to a IEMFPURESULT for the output.
3676; @param A2 Pointer to the 64-bit floating point value to convert.
3677;
3678BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3679 PROLOGUE_3_ARGS
3680 sub xSP, 20h
3681
3682 fninit
3683 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3684 fld qword [A2]
3685
3686 fnstsw word [A1 + IEMFPURESULT.FSW]
3687 fnclex
3688 fstp tword [A1 + IEMFPURESULT.r80Result]
3689
3690 fninit
3691 add xSP, 20h
3692 EPILOGUE_3_ARGS
3693ENDPROC iemAImpl_fld_r80_from_r64
3694
3695
3696;;
3697; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3698;
3699; @param A0 FPU context (fxsave).
3700; @param A1 Where to return the output FSW.
3701; @param A2 Where to store the 64-bit value.
3702; @param A3 Pointer to the 80-bit value.
3703;
3704BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3705 PROLOGUE_4_ARGS
3706 sub xSP, 20h
3707
3708 fninit
3709 fld tword [A3]
3710 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3711 fst qword [A2]
3712
3713 fnstsw word [A1]
3714
3715 fninit
3716 add xSP, 20h
3717 EPILOGUE_4_ARGS
3718ENDPROC iemAImpl_fst_r80_to_r64
3719
3720
3721;;
3722; FPU instruction working on one 80-bit and one 64-bit floating point value.
3723;
3724; @param 1 The instruction
3725;
3726; @param A0 FPU context (fxsave).
3727; @param A1 Pointer to a IEMFPURESULT for the output.
3728; @param A2 Pointer to the 80-bit value.
3729; @param A3 Pointer to the 64-bit value.
3730;
3731%macro IEMIMPL_FPU_R80_BY_R64 1
3732BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3733 PROLOGUE_4_ARGS
3734 sub xSP, 20h
3735
3736 fninit
3737 fld tword [A2]
3738 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3739 %1 qword [A3]
3740
3741 fnstsw word [A1 + IEMFPURESULT.FSW]
3742 fnclex
3743 fstp tword [A1 + IEMFPURESULT.r80Result]
3744
3745 fninit
3746 add xSP, 20h
3747 EPILOGUE_4_ARGS
3748ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3749%endmacro
3750
3751IEMIMPL_FPU_R80_BY_R64 fadd
3752IEMIMPL_FPU_R80_BY_R64 fmul
3753IEMIMPL_FPU_R80_BY_R64 fsub
3754IEMIMPL_FPU_R80_BY_R64 fsubr
3755IEMIMPL_FPU_R80_BY_R64 fdiv
3756IEMIMPL_FPU_R80_BY_R64 fdivr
3757
3758;;
3759; FPU instruction working on one 80-bit and one 64-bit floating point value,
3760; only returning FSW.
3761;
3762; @param 1 The instruction
3763;
3764; @param A0 FPU context (fxsave).
3765; @param A1 Where to store the output FSW.
3766; @param A2 Pointer to the 80-bit value.
3767; @param A3 Pointer to the 64-bit value.
3768;
3769%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3770BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3771 PROLOGUE_4_ARGS
3772 sub xSP, 20h
3773
3774 fninit
3775 fld tword [A2]
3776 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3777 %1 qword [A3]
3778
3779 fnstsw word [A1]
3780
3781 fninit
3782 add xSP, 20h
3783 EPILOGUE_4_ARGS
3784ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3785%endmacro
3786
3787IEMIMPL_FPU_R80_BY_R64_FSW fcom
3788
3789
3790
3791;
3792;---------------------- 80-bit floating point operations ----------------------
3793;
3794
3795;;
3796; Loads a 80-bit floating point register value from memory.
3797;
3798; @param A0 FPU context (fxsave).
3799; @param A1 Pointer to a IEMFPURESULT for the output.
3800; @param A2 Pointer to the 80-bit floating point value to load.
3801;
3802BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3803 PROLOGUE_3_ARGS
3804 sub xSP, 20h
3805
3806 fninit
3807 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3808 fld tword [A2]
3809
3810 fnstsw word [A1 + IEMFPURESULT.FSW]
3811 fnclex
3812 fstp tword [A1 + IEMFPURESULT.r80Result]
3813
3814 fninit
3815 add xSP, 20h
3816 EPILOGUE_3_ARGS
3817ENDPROC iemAImpl_fld_r80_from_r80
3818
3819
3820;;
3821; Store a 80-bit floating point register to memory
3822;
3823; @param A0 FPU context (fxsave).
3824; @param A1 Where to return the output FSW.
3825; @param A2 Where to store the 80-bit value.
3826; @param A3 Pointer to the 80-bit register value.
3827;
3828BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3829 PROLOGUE_4_ARGS
3830 sub xSP, 20h
3831
3832 fninit
3833 fld tword [A3]
3834 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3835 fstp tword [A2]
3836
3837 fnstsw word [A1]
3838
3839 fninit
3840 add xSP, 20h
3841 EPILOGUE_4_ARGS
3842ENDPROC iemAImpl_fst_r80_to_r80
3843
3844
3845;;
3846; Loads an 80-bit floating point register value in BCD format from memory.
3847;
3848; @param A0 FPU context (fxsave).
3849; @param A1 Pointer to a IEMFPURESULT for the output.
3850; @param A2 Pointer to the 80-bit BCD value to load.
3851;
3852BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3853 PROLOGUE_3_ARGS
3854 sub xSP, 20h
3855
3856 fninit
3857 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3858 fbld tword [A2]
3859
3860 fnstsw word [A1 + IEMFPURESULT.FSW]
3861 fnclex
3862 fstp tword [A1 + IEMFPURESULT.r80Result]
3863
3864 fninit
3865 add xSP, 20h
3866 EPILOGUE_3_ARGS
3867ENDPROC iemAImpl_fld_r80_from_d80
3868
3869
3870;;
3871; Store a 80-bit floating point register to memory as BCD
3872;
3873; @param A0 FPU context (fxsave).
3874; @param A1 Where to return the output FSW.
3875; @param A2 Where to store the 80-bit BCD value.
3876; @param A3 Pointer to the 80-bit register value.
3877;
3878BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3879 PROLOGUE_4_ARGS
3880 sub xSP, 20h
3881
3882 fninit
3883 fld tword [A3]
3884 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3885 fbstp tword [A2]
3886
3887 fnstsw word [A1]
3888
3889 fninit
3890 add xSP, 20h
3891 EPILOGUE_4_ARGS
3892ENDPROC iemAImpl_fst_r80_to_d80
3893
3894
3895;;
3896; FPU instruction working on two 80-bit floating point values.
3897;
3898; @param 1 The instruction
3899;
3900; @param A0 FPU context (fxsave).
3901; @param A1 Pointer to a IEMFPURESULT for the output.
3902; @param A2 Pointer to the first 80-bit value (ST0)
3903; @param A3 Pointer to the second 80-bit value (STn).
3904;
3905%macro IEMIMPL_FPU_R80_BY_R80 2
3906BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3907 PROLOGUE_4_ARGS
3908 sub xSP, 20h
3909
3910 fninit
3911 fld tword [A3]
3912 fld tword [A2]
3913 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3914 %1 %2
3915
3916 fnstsw word [A1 + IEMFPURESULT.FSW]
3917 fnclex
3918 fstp tword [A1 + IEMFPURESULT.r80Result]
3919
3920 fninit
3921 add xSP, 20h
3922 EPILOGUE_4_ARGS
3923ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3924%endmacro
3925
3926IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3927IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3928IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3929IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3930IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3931IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3932IEMIMPL_FPU_R80_BY_R80 fprem, {}
3933IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3934IEMIMPL_FPU_R80_BY_R80 fscale, {}
3935
3936
3937;;
3938; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3939; storing the result in ST1 and popping the stack.
3940;
3941; @param 1 The instruction
3942;
3943; @param A0 FPU context (fxsave).
3944; @param A1 Pointer to a IEMFPURESULT for the output.
3945; @param A2 Pointer to the first 80-bit value (ST1).
3946; @param A3 Pointer to the second 80-bit value (ST0).
3947;
3948%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3949BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3950 PROLOGUE_4_ARGS
3951 sub xSP, 20h
3952
3953 fninit
3954 fld tword [A2]
3955 fld tword [A3]
3956 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3957 %1
3958
3959 fnstsw word [A1 + IEMFPURESULT.FSW]
3960 fnclex
3961 fstp tword [A1 + IEMFPURESULT.r80Result]
3962
3963 fninit
3964 add xSP, 20h
3965 EPILOGUE_4_ARGS
3966ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3967%endmacro
3968
3969IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3970IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3971IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3972
3973
3974;;
3975; FPU instruction working on two 80-bit floating point values, only
3976; returning FSW.
3977;
3978; @param 1 The instruction
3979;
3980; @param A0 FPU context (fxsave).
3981; @param A1 Pointer to a uint16_t for the resulting FSW.
3982; @param A2 Pointer to the first 80-bit value.
3983; @param A3 Pointer to the second 80-bit value.
3984;
3985%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3986BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3987 PROLOGUE_4_ARGS
3988 sub xSP, 20h
3989
3990 fninit
3991 fld tword [A3]
3992 fld tword [A2]
3993 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3994 %1 st0, st1
3995
3996 fnstsw word [A1]
3997
3998 fninit
3999 add xSP, 20h
4000 EPILOGUE_4_ARGS
4001ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
4002%endmacro
4003
4004IEMIMPL_FPU_R80_BY_R80_FSW fcom
4005IEMIMPL_FPU_R80_BY_R80_FSW fucom
4006
4007
4008;;
4009; FPU instruction working on two 80-bit floating point values,
4010; returning FSW and EFLAGS (eax).
4011;
4012; @param 1 The instruction
4013;
4014; @returns EFLAGS in EAX.
4015; @param A0 FPU context (fxsave).
4016; @param A1 Pointer to a uint16_t for the resulting FSW.
4017; @param A2 Pointer to the first 80-bit value.
4018; @param A3 Pointer to the second 80-bit value.
4019;
4020%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
4021BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
4022 PROLOGUE_4_ARGS
4023 sub xSP, 20h
4024
4025 fninit
4026 fld tword [A3]
4027 fld tword [A2]
4028 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4029 %1 st1
4030
4031 fnstsw word [A1]
4032 pushf
4033 pop xAX
4034
4035 fninit
4036 add xSP, 20h
4037 EPILOGUE_4_ARGS
4038ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
4039%endmacro
4040
4041IEMIMPL_FPU_R80_BY_R80_EFL fcomi
4042IEMIMPL_FPU_R80_BY_R80_EFL fucomi
4043
4044
4045;;
4046; FPU instruction working on one 80-bit floating point value.
4047;
4048; @param 1 The instruction
4049;
4050; @param A0 FPU context (fxsave).
4051; @param A1 Pointer to a IEMFPURESULT for the output.
4052; @param A2 Pointer to the 80-bit value.
4053;
4054%macro IEMIMPL_FPU_R80 1
4055BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
4056 PROLOGUE_3_ARGS
4057 sub xSP, 20h
4058
4059 fninit
4060 fld tword [A2]
4061 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4062 %1
4063
4064 fnstsw word [A1 + IEMFPURESULT.FSW]
4065 fnclex
4066 fstp tword [A1 + IEMFPURESULT.r80Result]
4067
4068 fninit
4069 add xSP, 20h
4070 EPILOGUE_3_ARGS
4071ENDPROC iemAImpl_ %+ %1 %+ _r80
4072%endmacro
4073
4074IEMIMPL_FPU_R80 fchs
4075IEMIMPL_FPU_R80 fabs
4076IEMIMPL_FPU_R80 f2xm1
4077IEMIMPL_FPU_R80 fsqrt
4078IEMIMPL_FPU_R80 frndint
4079IEMIMPL_FPU_R80 fsin
4080IEMIMPL_FPU_R80 fcos
4081
4082
4083;;
4084; FPU instruction working on one 80-bit floating point value, only
4085; returning FSW.
4086;
4087; @param 1 The instruction
4088; @param 2 Non-zero to also restore FTW.
4089;
4090; @param A0 FPU context (fxsave).
4091; @param A1 Pointer to a uint16_t for the resulting FSW.
4092; @param A2 Pointer to the 80-bit value.
4093;
4094%macro IEMIMPL_FPU_R80_FSW 2
4095BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
4096 PROLOGUE_3_ARGS
4097 sub xSP, 20h
4098
4099 fninit
4100 fld tword [A2]
4101%if %2 != 0
4102 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
4103%else
4104 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4105%endif
4106 %1
4107
4108 fnstsw word [A1]
4109
4110 fninit
4111 add xSP, 20h
4112 EPILOGUE_3_ARGS
4113ENDPROC iemAImpl_ %+ %1 %+ _r80
4114%endmacro
4115
4116IEMIMPL_FPU_R80_FSW ftst, 0
4117IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
4118
4119
4120
4121;;
4122; FPU instruction loading a 80-bit floating point constant.
4123;
4124; @param 1 The instruction
4125;
4126; @param A0 FPU context (fxsave).
4127; @param A1 Pointer to a IEMFPURESULT for the output.
4128;
4129%macro IEMIMPL_FPU_R80_CONST 1
4130BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
4131 PROLOGUE_2_ARGS
4132 sub xSP, 20h
4133
4134 fninit
4135 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4136 %1
4137
4138 fnstsw word [A1 + IEMFPURESULT.FSW]
4139 fnclex
4140 fstp tword [A1 + IEMFPURESULT.r80Result]
4141
4142 fninit
4143 add xSP, 20h
4144 EPILOGUE_2_ARGS
4145ENDPROC iemAImpl_ %+ %1 %+
4146%endmacro
4147
4148IEMIMPL_FPU_R80_CONST fld1
4149IEMIMPL_FPU_R80_CONST fldl2t
4150IEMIMPL_FPU_R80_CONST fldl2e
4151IEMIMPL_FPU_R80_CONST fldpi
4152IEMIMPL_FPU_R80_CONST fldlg2
4153IEMIMPL_FPU_R80_CONST fldln2
4154IEMIMPL_FPU_R80_CONST fldz
4155
4156
4157;;
4158; FPU instruction working on one 80-bit floating point value, outputing two.
4159;
4160; @param 1 The instruction
4161;
4162; @param A0 FPU context (fxsave).
4163; @param A1 Pointer to a IEMFPURESULTTWO for the output.
4164; @param A2 Pointer to the 80-bit value.
4165;
4166%macro IEMIMPL_FPU_R80_R80 1
4167BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
4168 PROLOGUE_3_ARGS
4169 sub xSP, 20h
4170
4171 fninit
4172 fld tword [A2]
4173 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4174 %1
4175
4176 fnstsw word [A1 + IEMFPURESULTTWO.FSW]
4177 fnclex
4178 fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
4179 fnclex
4180 fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
4181
4182 fninit
4183 add xSP, 20h
4184 EPILOGUE_3_ARGS
4185ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
4186%endmacro
4187
4188IEMIMPL_FPU_R80_R80 fptan
4189IEMIMPL_FPU_R80_R80 fxtract
4190IEMIMPL_FPU_R80_R80 fsincos
4191
4192
4193
4194
4195;---------------------- SSE and MMX Operations ----------------------
4196
4197;; @todo what do we need to do for MMX?
4198%macro IEMIMPL_MMX_PROLOGUE 0
4199%endmacro
4200%macro IEMIMPL_MMX_EPILOGUE 0
4201%endmacro
4202
4203;; @todo what do we need to do for SSE?
4204%macro IEMIMPL_SSE_PROLOGUE 0
4205%endmacro
4206%macro IEMIMPL_SSE_EPILOGUE 0
4207%endmacro
4208
4209;; @todo what do we need to do for AVX?
4210%macro IEMIMPL_AVX_PROLOGUE 0
4211%endmacro
4212%macro IEMIMPL_AVX_EPILOGUE 0
4213%endmacro
4214
4215
4216;;
4217; Media instruction working on two full sized registers.
4218;
4219; @param 1 The instruction
4220; @param 2 Whether there is an MMX variant (1) or not (0).
4221;
4222; @param A0 FPU context (fxsave).
4223; @param A1 Pointer to the first media register size operand (input/output).
4224; @param A2 Pointer to the second media register size operand (input).
4225;
4226; @todo r=aeichner Currently unused, can probably be removed.
4227;
4228%macro IEMIMPL_MEDIA_F2 2
4229%if %2 != 0
4230BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
4231 PROLOGUE_3_ARGS
4232 IEMIMPL_MMX_PROLOGUE
4233
4234 movq mm0, [A1]
4235 movq mm1, [A2]
4236 %1 mm0, mm1
4237 movq [A1], mm0
4238
4239 IEMIMPL_MMX_EPILOGUE
4240 EPILOGUE_3_ARGS
4241ENDPROC iemAImpl_ %+ %1 %+ _u64
4242%endif
4243
4244BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4245 PROLOGUE_3_ARGS
4246 IEMIMPL_SSE_PROLOGUE
4247
4248 movdqu xmm0, [A1]
4249 movdqu xmm1, [A2]
4250 %1 xmm0, xmm1
4251 movdqu [A1], xmm0
4252
4253 IEMIMPL_SSE_EPILOGUE
4254 EPILOGUE_3_ARGS
4255ENDPROC iemAImpl_ %+ %1 %+ _u128
4256%endmacro
4257
4258;;
4259; Media instruction working on two full sized registers, but no FXSAVE state argument.
4260;
4261; @param 1 The instruction
4262; @param 2 Whether there is an MMX variant (1) or not (0).
4263;
4264; @param A0 Pointer to the first media register size operand (input/output).
4265; @param A1 Pointer to the second media register size operand (input).
4266;
4267%macro IEMIMPL_MEDIA_OPT_F2 2
4268%if %2 != 0
4269BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
4270 PROLOGUE_2_ARGS
4271 IEMIMPL_MMX_PROLOGUE
4272
4273 movq mm0, [A0]
4274 movq mm1, [A1]
4275 %1 mm0, mm1
4276 movq [A0], mm0
4277
4278 IEMIMPL_MMX_EPILOGUE
4279 EPILOGUE_2_ARGS
4280ENDPROC iemAImpl_ %+ %1 %+ _u64
4281%endif
4282
4283BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
4284 PROLOGUE_2_ARGS
4285 IEMIMPL_SSE_PROLOGUE
4286
4287 movdqu xmm0, [A0]
4288 movdqu xmm1, [A1]
4289 %1 xmm0, xmm1
4290 movdqu [A0], xmm0
4291
4292 IEMIMPL_SSE_EPILOGUE
4293 EPILOGUE_2_ARGS
4294ENDPROC iemAImpl_ %+ %1 %+ _u128
4295%endmacro
4296
4297IEMIMPL_MEDIA_OPT_F2 pshufb, 1
4298IEMIMPL_MEDIA_OPT_F2 pand, 1
4299IEMIMPL_MEDIA_OPT_F2 pandn, 1
4300IEMIMPL_MEDIA_OPT_F2 por, 1
4301IEMIMPL_MEDIA_OPT_F2 pxor, 1
4302IEMIMPL_MEDIA_OPT_F2 pcmpeqb, 1
4303IEMIMPL_MEDIA_OPT_F2 pcmpeqw, 1
4304IEMIMPL_MEDIA_OPT_F2 pcmpeqd, 1
4305IEMIMPL_MEDIA_OPT_F2 pcmpeqq, 0
4306IEMIMPL_MEDIA_OPT_F2 pcmpgtb, 1
4307IEMIMPL_MEDIA_OPT_F2 pcmpgtw, 1
4308IEMIMPL_MEDIA_OPT_F2 pcmpgtd, 1
4309IEMIMPL_MEDIA_OPT_F2 pcmpgtq, 0
4310IEMIMPL_MEDIA_OPT_F2 paddb, 1
4311IEMIMPL_MEDIA_OPT_F2 paddw, 1
4312IEMIMPL_MEDIA_OPT_F2 paddd, 1
4313IEMIMPL_MEDIA_OPT_F2 paddq, 1
4314IEMIMPL_MEDIA_OPT_F2 paddsb, 1
4315IEMIMPL_MEDIA_OPT_F2 paddsw, 1
4316IEMIMPL_MEDIA_OPT_F2 paddusb, 1
4317IEMIMPL_MEDIA_OPT_F2 paddusw, 1
4318IEMIMPL_MEDIA_OPT_F2 psubb, 1
4319IEMIMPL_MEDIA_OPT_F2 psubw, 1
4320IEMIMPL_MEDIA_OPT_F2 psubd, 1
4321IEMIMPL_MEDIA_OPT_F2 psubq, 1
4322IEMIMPL_MEDIA_OPT_F2 psubsb, 1
4323IEMIMPL_MEDIA_OPT_F2 psubsw, 1
4324IEMIMPL_MEDIA_OPT_F2 psubusb, 1
4325IEMIMPL_MEDIA_OPT_F2 psubusw, 1
4326IEMIMPL_MEDIA_OPT_F2 pmullw, 1
4327IEMIMPL_MEDIA_OPT_F2 pmulld, 0
4328IEMIMPL_MEDIA_OPT_F2 pmulhw, 1
4329IEMIMPL_MEDIA_OPT_F2 pmaddwd, 1
4330IEMIMPL_MEDIA_OPT_F2 pminub, 1
4331IEMIMPL_MEDIA_OPT_F2 pminuw, 0
4332IEMIMPL_MEDIA_OPT_F2 pminud, 0
4333IEMIMPL_MEDIA_OPT_F2 pminsb, 0
4334IEMIMPL_MEDIA_OPT_F2 pminsw, 1
4335IEMIMPL_MEDIA_OPT_F2 pminsd, 0
4336IEMIMPL_MEDIA_OPT_F2 pmaxub, 1
4337IEMIMPL_MEDIA_OPT_F2 pmaxuw, 0
4338IEMIMPL_MEDIA_OPT_F2 pmaxud, 0
4339IEMIMPL_MEDIA_OPT_F2 pmaxsb, 0
4340IEMIMPL_MEDIA_OPT_F2 pmaxsw, 1
4341IEMIMPL_MEDIA_OPT_F2 pmaxsd, 0
4342IEMIMPL_MEDIA_OPT_F2 pabsb, 1
4343IEMIMPL_MEDIA_OPT_F2 pabsw, 1
4344IEMIMPL_MEDIA_OPT_F2 pabsd, 1
4345IEMIMPL_MEDIA_OPT_F2 psignb, 1
4346IEMIMPL_MEDIA_OPT_F2 psignw, 1
4347IEMIMPL_MEDIA_OPT_F2 psignd, 1
4348IEMIMPL_MEDIA_OPT_F2 phaddw, 1
4349IEMIMPL_MEDIA_OPT_F2 phaddd, 1
4350IEMIMPL_MEDIA_OPT_F2 phsubw, 1
4351IEMIMPL_MEDIA_OPT_F2 phsubd, 1
4352IEMIMPL_MEDIA_OPT_F2 phaddsw, 1
4353IEMIMPL_MEDIA_OPT_F2 phsubsw, 1
4354IEMIMPL_MEDIA_OPT_F2 pmaddubsw, 1
4355IEMIMPL_MEDIA_OPT_F2 pmulhrsw, 1
4356IEMIMPL_MEDIA_OPT_F2 pmuludq, 1
4357IEMIMPL_MEDIA_OPT_F2 packsswb, 1
4358IEMIMPL_MEDIA_OPT_F2 packssdw, 1
4359IEMIMPL_MEDIA_OPT_F2 packuswb, 1
4360IEMIMPL_MEDIA_OPT_F2 packusdw, 0
4361IEMIMPL_MEDIA_OPT_F2 psllw, 1
4362IEMIMPL_MEDIA_OPT_F2 pslld, 1
4363IEMIMPL_MEDIA_OPT_F2 psllq, 1
4364IEMIMPL_MEDIA_OPT_F2 psrlw, 1
4365IEMIMPL_MEDIA_OPT_F2 psrld, 1
4366IEMIMPL_MEDIA_OPT_F2 psrlq, 1
4367IEMIMPL_MEDIA_OPT_F2 psraw, 1
4368IEMIMPL_MEDIA_OPT_F2 psrad, 1
4369IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
4370IEMIMPL_MEDIA_OPT_F2 pavgb, 1
4371IEMIMPL_MEDIA_OPT_F2 pavgw, 1
4372IEMIMPL_MEDIA_OPT_F2 psadbw, 1
4373IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
4374IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
4375IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
4376IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
4377IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
4378IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
4379IEMIMPL_MEDIA_OPT_F2 aesimc, 0
4380IEMIMPL_MEDIA_OPT_F2 aesenc, 0
4381IEMIMPL_MEDIA_OPT_F2 aesdec, 0
4382IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
4383IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
4384IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
4385IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
4386IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
4387IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
4388IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
4389
4390;;
4391; Media instruction working on one full sized and one half sized register (lower half).
4392;
4393; @param 1 The instruction
4394; @param 2 1 if MMX is included, 0 if not.
4395;
4396; @param A0 Pointer to the first full sized media register operand (input/output).
4397; @param A1 Pointer to the second half sized media register operand (input).
4398;
4399%macro IEMIMPL_MEDIA_F1L1 2
4400 %if %2 != 0
4401BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
4402 PROLOGUE_2_ARGS
4403 IEMIMPL_MMX_PROLOGUE
4404
4405 movq mm0, [A0]
4406 movq mm1, [A1]
4407 %1 mm0, mm1
4408 movq [A0], mm0
4409
4410 IEMIMPL_MMX_EPILOGUE
4411 EPILOGUE_2_ARGS
4412ENDPROC iemAImpl_ %+ %1 %+ _u64
4413 %endif
4414
4415BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
4416 PROLOGUE_2_ARGS
4417 IEMIMPL_SSE_PROLOGUE
4418
4419 movdqu xmm0, [A0]
4420 movdqu xmm1, [A1]
4421 %1 xmm0, xmm1
4422 movdqu [A0], xmm0
4423
4424 IEMIMPL_SSE_EPILOGUE
4425 EPILOGUE_2_ARGS
4426ENDPROC iemAImpl_ %+ %1 %+ _u128
4427%endmacro
4428
4429IEMIMPL_MEDIA_F1L1 punpcklbw, 1
4430IEMIMPL_MEDIA_F1L1 punpcklwd, 1
4431IEMIMPL_MEDIA_F1L1 punpckldq, 1
4432IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
4433
4434
4435;;
4436; Media instruction working two half sized input registers (lower half) and a full sized
4437; destination register (vpunpckh*).
4438;
4439; @param 1 The instruction
4440;
4441; @param A0 Pointer to the destination register (full sized, output only).
4442; @param A1 Pointer to the first full sized media source register operand, where we
4443; will only use the lower half as input - but we'll be loading it in full.
4444; @param A2 Pointer to the second full sized media source register operand, where we
4445; will only use the lower half as input - but we'll be loading it in full.
4446;
4447%macro IEMIMPL_MEDIA_F1L1L1 1
4448BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4449 PROLOGUE_3_ARGS
4450 IEMIMPL_AVX_PROLOGUE
4451
4452 vmovdqu xmm0, [A1]
4453 vmovdqu xmm1, [A2]
4454 %1 xmm0, xmm0, xmm1
4455 vmovdqu [A0], xmm0
4456
4457 IEMIMPL_AVX_PROLOGUE
4458 EPILOGUE_3_ARGS
4459ENDPROC iemAImpl_ %+ %1 %+ _u128
4460
4461BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4462 PROLOGUE_3_ARGS
4463 IEMIMPL_AVX_PROLOGUE
4464
4465 vmovdqu ymm0, [A1]
4466 vmovdqu ymm1, [A2]
4467 %1 ymm0, ymm0, ymm1
4468 vmovdqu [A0], ymm0
4469
4470 IEMIMPL_AVX_PROLOGUE
4471 EPILOGUE_3_ARGS
4472ENDPROC iemAImpl_ %+ %1 %+ _u256
4473%endmacro
4474
4475IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
4476IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
4477IEMIMPL_MEDIA_F1L1L1 vpunpckldq
4478IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
4479
4480
4481;;
4482; Media instruction working on one full sized and one half sized register (high half).
4483;
4484; @param 1 The instruction
4485; @param 2 1 if MMX is included, 0 if not.
4486;
4487; @param A0 Pointer to the first full sized media register operand (input/output).
4488; @param A1 Pointer to the second full sized media register operand, where we
4489; will only use the upper half as input - but we'll load it in full.
4490;
4491%macro IEMIMPL_MEDIA_F1H1 2
4492IEMIMPL_MEDIA_F1L1 %1, %2
4493%endmacro
4494
4495IEMIMPL_MEDIA_F1L1 punpckhbw, 1
4496IEMIMPL_MEDIA_F1L1 punpckhwd, 1
4497IEMIMPL_MEDIA_F1L1 punpckhdq, 1
4498IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
4499
4500
4501;;
4502; Media instruction working two half sized input registers (high half) and a full sized
4503; destination register (vpunpckh*).
4504;
4505; @param 1 The instruction
4506;
4507; @param A0 Pointer to the destination register (full sized, output only).
4508; @param A1 Pointer to the first full sized media source register operand, where we
4509; will only use the upper half as input - but we'll be loading it in full.
4510; @param A2 Pointer to the second full sized media source register operand, where we
4511; will only use the upper half as input - but we'll be loading it in full.
4512;
4513%macro IEMIMPL_MEDIA_F1H1H1 1
4514IEMIMPL_MEDIA_F1L1L1 %1
4515%endmacro
4516
4517IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
4518IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
4519IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
4520IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4521
4522
4523;
4524; Shufflers with evil 8-bit immediates.
4525;
4526
4527BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
4528 PROLOGUE_3_ARGS
4529 IEMIMPL_MMX_PROLOGUE
4530
4531 movzx A2, A2_8 ; must clear top bits
4532 movq mm1, [A1]
4533 movq mm0, mm0 ; paranoia!
4534 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 5
4535 movq [A0], mm0
4536
4537 IEMIMPL_MMX_EPILOGUE
4538 EPILOGUE_3_ARGS
4539%assign bImm 0
4540%rep 256
4541.imm %+ bImm:
4542 IBT_ENDBRxx_WITHOUT_NOTRACK
4543 pshufw mm0, mm1, bImm
4544 ret
4545 %assign bImm bImm + 1
4546%endrep
4547.immEnd:
4548ENDPROC iemAImpl_pshufw_u64
4549
4550
4551%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
4552BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4553 PROLOGUE_3_ARGS
4554 IEMIMPL_SSE_PROLOGUE
4555
4556 movzx A2, A2_8 ; must clear top bits
4557 movdqu xmm1, [A1]
4558 movdqu xmm0, xmm1 ; paranoia!
4559 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
4560 movdqu [A0], xmm0
4561
4562 IEMIMPL_SSE_EPILOGUE
4563 EPILOGUE_3_ARGS
4564
4565 %assign bImm 0
4566 %rep 256
4567.imm %+ bImm:
4568 IBT_ENDBRxx_WITHOUT_NOTRACK
4569 %1 xmm0, xmm1, bImm
4570 ret
4571 %assign bImm bImm + 1
4572 %endrep
4573.immEnd:
4574ENDPROC iemAImpl_ %+ %1 %+ _u128
4575%endmacro
4576
4577IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
4578IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
4579IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4580
4581
4582%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
4583BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4584 PROLOGUE_3_ARGS
4585 IEMIMPL_SSE_PROLOGUE
4586
4587 movzx A2, A2_8 ; must clear top bits
4588 vmovdqu ymm1, [A1]
4589 vmovdqu ymm0, ymm1 ; paranoia!
4590 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
4591 vmovdqu [A0], ymm0
4592
4593 IEMIMPL_SSE_EPILOGUE
4594 EPILOGUE_3_ARGS
4595 %assign bImm 0
4596 %rep 256
4597.imm %+ bImm:
4598 IBT_ENDBRxx_WITHOUT_NOTRACK
4599 %1 ymm0, ymm1, bImm
4600 ret
4601 %assign bImm bImm + 1
4602 %endrep
4603.immEnd:
4604ENDPROC iemAImpl_ %+ %1 %+ _u256
4605%endmacro
4606
4607IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
4608IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
4609IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4610
4611
4612;
4613; Shifts with evil 8-bit immediates.
4614;
4615
4616%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
4617BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
4618 PROLOGUE_2_ARGS
4619 IEMIMPL_MMX_PROLOGUE
4620
4621 movzx A1, A1_8 ; must clear top bits
4622 movq mm0, [A0]
4623 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 5
4624 movq [A0], mm0
4625
4626 IEMIMPL_MMX_EPILOGUE
4627 EPILOGUE_2_ARGS
4628%assign bImm 0
4629%rep 256
4630.imm %+ bImm:
4631 IBT_ENDBRxx_WITHOUT_NOTRACK
4632 %1 mm0, bImm
4633 ret
4634 %assign bImm bImm + 1
4635%endrep
4636.immEnd:
4637ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
4638%endmacro
4639
4640IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4641IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4642IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4643IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4644IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4645IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4646IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4647IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4648
4649
4650%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
4651BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
4652 PROLOGUE_2_ARGS
4653 IEMIMPL_SSE_PROLOGUE
4654
4655 movzx A1, A1_8 ; must clear top bits
4656 movdqu xmm0, [A0]
4657 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 6
4658 movdqu [A0], xmm0
4659
4660 IEMIMPL_SSE_EPILOGUE
4661 EPILOGUE_2_ARGS
4662 %assign bImm 0
4663 %rep 256
4664.imm %+ bImm:
4665 IBT_ENDBRxx_WITHOUT_NOTRACK
4666 %1 xmm0, bImm
4667 ret
4668 %assign bImm bImm + 1
4669 %endrep
4670.immEnd:
4671ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
4672%endmacro
4673
4674IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
4675IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
4676IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
4677IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
4678IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
4679IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
4680IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
4681IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
4682IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
4683IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4684
4685
4686;
4687; Move byte mask.
4688;
4689
4690BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
4691 PROLOGUE_2_ARGS
4692 IEMIMPL_MMX_PROLOGUE
4693
4694 movq mm1, [A1]
4695 pmovmskb T0, mm1
4696 mov [A0], T0
4697%ifdef RT_ARCH_X86
4698 mov dword [A0 + 4], 0
4699%endif
4700 IEMIMPL_MMX_EPILOGUE
4701 EPILOGUE_2_ARGS
4702ENDPROC iemAImpl_pmovmskb_u64
4703
4704BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
4705 PROLOGUE_2_ARGS
4706 IEMIMPL_SSE_PROLOGUE
4707
4708 movdqu xmm1, [A1]
4709 pmovmskb T0, xmm1
4710 mov [A0], T0
4711%ifdef RT_ARCH_X86
4712 mov dword [A0 + 4], 0
4713%endif
4714 IEMIMPL_SSE_EPILOGUE
4715 EPILOGUE_2_ARGS
4716ENDPROC iemAImpl_pmovmskb_u128
4717
4718BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
4719 PROLOGUE_2_ARGS
4720 IEMIMPL_AVX_PROLOGUE
4721
4722 vmovdqu ymm1, [A1]
4723 vpmovmskb T0, ymm1
4724 mov [A0], T0
4725%ifdef RT_ARCH_X86
4726 mov dword [A0 + 4], 0
4727%endif
4728 IEMIMPL_AVX_EPILOGUE
4729 EPILOGUE_2_ARGS
4730ENDPROC iemAImpl_vpmovmskb_u256
4731
4732
4733;;
4734; Media instruction working on two full sized source registers and one destination (AVX).
4735;
4736; @param 1 The instruction
4737;
4738; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4739; @param A1 Pointer to the destination media register size operand (output).
4740; @param A2 Pointer to the first source media register size operand (input).
4741; @param A3 Pointer to the second source media register size operand (input).
4742;
4743; @todo r=aeichner Not used right now
4744;
4745%macro IEMIMPL_MEDIA_F3 1
4746BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4747 PROLOGUE_4_ARGS
4748 IEMIMPL_AVX_PROLOGUE
4749
4750 vmovdqu xmm0, [A2]
4751 vmovdqu xmm1, [A3]
4752 %1 xmm0, xmm0, xmm1
4753 vmovdqu [A1], xmm0
4754
4755 IEMIMPL_AVX_PROLOGUE
4756 EPILOGUE_4_ARGS
4757ENDPROC iemAImpl_ %+ %1 %+ _u128
4758
4759BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4760 PROLOGUE_4_ARGS
4761 IEMIMPL_AVX_PROLOGUE
4762
4763 vmovdqu ymm0, [A2]
4764 vmovdqu ymm1, [A3]
4765 %1 ymm0, ymm0, ymm1
4766 vmovdqu [A1], ymm0
4767
4768 IEMIMPL_AVX_PROLOGUE
4769 EPILOGUE_4_ARGS
4770ENDPROC iemAImpl_ %+ %1 %+ _u256
4771%endmacro
4772
4773;;
4774; Media instruction working on two full sized source registers and one destination (AVX),
4775; but no XSAVE state pointer argument.
4776;
4777; @param 1 The instruction
4778;
4779; @param A0 Pointer to the destination media register size operand (output).
4780; @param A1 Pointer to the first source media register size operand (input).
4781; @param A2 Pointer to the second source media register size operand (input).
4782;
4783%macro IEMIMPL_MEDIA_OPT_F3 1
4784BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4785 PROLOGUE_3_ARGS
4786 IEMIMPL_AVX_PROLOGUE
4787
4788 vmovdqu xmm0, [A1]
4789 vmovdqu xmm1, [A2]
4790 %1 xmm0, xmm0, xmm1
4791 vmovdqu [A0], xmm0
4792
4793 IEMIMPL_AVX_PROLOGUE
4794 EPILOGUE_3_ARGS
4795ENDPROC iemAImpl_ %+ %1 %+ _u128
4796
4797BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4798 PROLOGUE_3_ARGS
4799 IEMIMPL_AVX_PROLOGUE
4800
4801 vmovdqu ymm0, [A1]
4802 vmovdqu ymm1, [A2]
4803 %1 ymm0, ymm0, ymm1
4804 vmovdqu [A0], ymm0
4805
4806 IEMIMPL_AVX_PROLOGUE
4807 EPILOGUE_3_ARGS
4808ENDPROC iemAImpl_ %+ %1 %+ _u256
4809%endmacro
4810
4811IEMIMPL_MEDIA_OPT_F3 vpshufb
4812IEMIMPL_MEDIA_OPT_F3 vpand
4813IEMIMPL_MEDIA_OPT_F3 vpminub
4814IEMIMPL_MEDIA_OPT_F3 vpminuw
4815IEMIMPL_MEDIA_OPT_F3 vpminud
4816IEMIMPL_MEDIA_OPT_F3 vpminsb
4817IEMIMPL_MEDIA_OPT_F3 vpminsw
4818IEMIMPL_MEDIA_OPT_F3 vpminsd
4819IEMIMPL_MEDIA_OPT_F3 vpmaxub
4820IEMIMPL_MEDIA_OPT_F3 vpmaxuw
4821IEMIMPL_MEDIA_OPT_F3 vpmaxud
4822IEMIMPL_MEDIA_OPT_F3 vpmaxsb
4823IEMIMPL_MEDIA_OPT_F3 vpmaxsw
4824IEMIMPL_MEDIA_OPT_F3 vpmaxsd
4825IEMIMPL_MEDIA_OPT_F3 vpandn
4826IEMIMPL_MEDIA_OPT_F3 vpor
4827IEMIMPL_MEDIA_OPT_F3 vpxor
4828IEMIMPL_MEDIA_OPT_F3 vpcmpeqb
4829IEMIMPL_MEDIA_OPT_F3 vpcmpeqw
4830IEMIMPL_MEDIA_OPT_F3 vpcmpeqd
4831IEMIMPL_MEDIA_OPT_F3 vpcmpeqq
4832IEMIMPL_MEDIA_OPT_F3 vpcmpgtb
4833IEMIMPL_MEDIA_OPT_F3 vpcmpgtw
4834IEMIMPL_MEDIA_OPT_F3 vpcmpgtd
4835IEMIMPL_MEDIA_OPT_F3 vpcmpgtq
4836IEMIMPL_MEDIA_OPT_F3 vpaddb
4837IEMIMPL_MEDIA_OPT_F3 vpaddw
4838IEMIMPL_MEDIA_OPT_F3 vpaddd
4839IEMIMPL_MEDIA_OPT_F3 vpaddq
4840IEMIMPL_MEDIA_OPT_F3 vpsubb
4841IEMIMPL_MEDIA_OPT_F3 vpsubw
4842IEMIMPL_MEDIA_OPT_F3 vpsubd
4843IEMIMPL_MEDIA_OPT_F3 vpsubq
4844IEMIMPL_MEDIA_OPT_F3 vpacksswb
4845IEMIMPL_MEDIA_OPT_F3 vpackssdw
4846IEMIMPL_MEDIA_OPT_F3 vpackuswb
4847IEMIMPL_MEDIA_OPT_F3 vpackusdw
4848IEMIMPL_MEDIA_OPT_F3 vpmullw
4849IEMIMPL_MEDIA_OPT_F3 vpmulld
4850IEMIMPL_MEDIA_OPT_F3 vpmulhw
4851IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4852IEMIMPL_MEDIA_OPT_F3 vpavgb
4853IEMIMPL_MEDIA_OPT_F3 vpavgw
4854IEMIMPL_MEDIA_OPT_F3 vpsignb
4855IEMIMPL_MEDIA_OPT_F3 vpsignw
4856IEMIMPL_MEDIA_OPT_F3 vpsignd
4857IEMIMPL_MEDIA_OPT_F3 vphaddw
4858IEMIMPL_MEDIA_OPT_F3 vphaddd
4859IEMIMPL_MEDIA_OPT_F3 vphsubw
4860IEMIMPL_MEDIA_OPT_F3 vphsubd
4861IEMIMPL_MEDIA_OPT_F3 vphaddsw
4862IEMIMPL_MEDIA_OPT_F3 vphsubsw
4863IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4864IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4865IEMIMPL_MEDIA_OPT_F3 vpsadbw
4866IEMIMPL_MEDIA_OPT_F3 vpmuldq
4867IEMIMPL_MEDIA_OPT_F3 vpmuludq
4868IEMIMPL_MEDIA_OPT_F3 vunpcklps
4869IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4870IEMIMPL_MEDIA_OPT_F3 vunpckhps
4871IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4872IEMIMPL_MEDIA_OPT_F3 vpsubsb
4873IEMIMPL_MEDIA_OPT_F3 vpsubsw
4874IEMIMPL_MEDIA_OPT_F3 vpsubusb
4875IEMIMPL_MEDIA_OPT_F3 vpsubusw
4876IEMIMPL_MEDIA_OPT_F3 vpaddusb
4877IEMIMPL_MEDIA_OPT_F3 vpaddusw
4878IEMIMPL_MEDIA_OPT_F3 vpaddsb
4879IEMIMPL_MEDIA_OPT_F3 vpaddsw
4880IEMIMPL_MEDIA_OPT_F3 vpermilps
4881IEMIMPL_MEDIA_OPT_F3 vpermilpd
4882IEMIMPL_MEDIA_OPT_F3 vpmaddwd
4883IEMIMPL_MEDIA_OPT_F3 vpsrlvd
4884IEMIMPL_MEDIA_OPT_F3 vpsrlvq
4885IEMIMPL_MEDIA_OPT_F3 vpsravd
4886IEMIMPL_MEDIA_OPT_F3 vpsllvd
4887IEMIMPL_MEDIA_OPT_F3 vpsllvq
4888
4889;;
4890; Media instruction working on one full sized source register, one full sized destination
4891; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
4892; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
4893; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
4894; of either 16, 32, or 64, it acts like the max shift size)
4895;
4896; @param 1 The instruction
4897;
4898; @param A0 Pointer to the destination media register size operand (output).
4899; @param A1 Pointer to the first source media register size operand (input).
4900; @param A2 Pointer to the second source media register size operand (input).
4901;
4902%macro IEMIMPL_SHIFT_OPT_F3 1
4903BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4904 PROLOGUE_3_ARGS
4905 IEMIMPL_AVX_PROLOGUE
4906
4907 vmovdqu xmm0, [A1]
4908 vmovdqu xmm1, [A2]
4909 %1 xmm0, xmm0, xmm1
4910 vmovdqu [A0], xmm0
4911
4912 IEMIMPL_AVX_PROLOGUE
4913 EPILOGUE_3_ARGS
4914ENDPROC iemAImpl_ %+ %1 %+ _u128
4915
4916BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4917 PROLOGUE_3_ARGS
4918 IEMIMPL_AVX_PROLOGUE
4919
4920 vmovdqu ymm0, [A1]
4921 vmovdqu xmm1, [A2]
4922 %1 ymm0, ymm0, xmm1
4923 vmovdqu [A0], ymm0
4924
4925 IEMIMPL_AVX_PROLOGUE
4926 EPILOGUE_3_ARGS
4927ENDPROC iemAImpl_ %+ %1 %+ _u256
4928%endmacro
4929
4930IEMIMPL_SHIFT_OPT_F3 vpsllw
4931IEMIMPL_SHIFT_OPT_F3 vpslld
4932IEMIMPL_SHIFT_OPT_F3 vpsllq
4933IEMIMPL_SHIFT_OPT_F3 vpsraw
4934IEMIMPL_SHIFT_OPT_F3 vpsrad
4935IEMIMPL_SHIFT_OPT_F3 vpsrlw
4936IEMIMPL_SHIFT_OPT_F3 vpsrld
4937IEMIMPL_SHIFT_OPT_F3 vpsrlq
4938
4939
4940;;
4941; Media instruction working on one full sized source registers and one destination (AVX),
4942; but no XSAVE state pointer argument.
4943;
4944; @param 1 The instruction
4945; @param 2 Flag whether the isntruction has a 256-bit (AVX2) variant (1) or not (0).
4946;
4947; @param A0 Pointer to the destination media register size operand (output).
4948; @param A1 Pointer to the source media register size operand (input).
4949;
4950%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
4951BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4952 PROLOGUE_2_ARGS
4953 IEMIMPL_AVX_PROLOGUE
4954
4955 vmovdqu xmm0, [A1]
4956 %1 xmm0, xmm0
4957 vmovdqu [A0], xmm0
4958
4959 IEMIMPL_AVX_PROLOGUE
4960 EPILOGUE_2_ARGS
4961ENDPROC iemAImpl_ %+ %1 %+ _u128
4962
4963 %if %2 == 1
4964BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4965 PROLOGUE_2_ARGS
4966 IEMIMPL_AVX_PROLOGUE
4967
4968 vmovdqu ymm0, [A1]
4969 %1 ymm0, ymm0
4970 vmovdqu [A0], ymm0
4971
4972 IEMIMPL_AVX_PROLOGUE
4973 EPILOGUE_2_ARGS
4974ENDPROC iemAImpl_ %+ %1 %+ _u256
4975 %endif
4976%endmacro
4977
4978IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4979IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4980IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4981IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4982
4983
4984;
4985; The SSE 4.2 crc32
4986;
4987; @param A1 Pointer to the 32-bit destination.
4988; @param A2 The source operand, sized according to the suffix.
4989;
4990BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
4991 PROLOGUE_2_ARGS
4992
4993 mov T0_32, [A0]
4994 crc32 T0_32, A1_8
4995 mov [A0], T0_32
4996
4997 EPILOGUE_2_ARGS
4998ENDPROC iemAImpl_crc32_u8
4999
5000BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
5001 PROLOGUE_2_ARGS
5002
5003 mov T0_32, [A0]
5004 crc32 T0_32, A1_16
5005 mov [A0], T0_32
5006
5007 EPILOGUE_2_ARGS
5008ENDPROC iemAImpl_crc32_u16
5009
5010BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
5011 PROLOGUE_2_ARGS
5012
5013 mov T0_32, [A0]
5014 crc32 T0_32, A1_32
5015 mov [A0], T0_32
5016
5017 EPILOGUE_2_ARGS
5018ENDPROC iemAImpl_crc32_u32
5019
5020%ifdef RT_ARCH_AMD64
5021BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
5022 PROLOGUE_2_ARGS
5023
5024 mov T0_32, [A0]
5025 crc32 T0, A1
5026 mov [A0], T0_32
5027
5028 EPILOGUE_2_ARGS
5029ENDPROC iemAImpl_crc32_u64
5030%endif
5031
5032
5033;
5034; PTEST (SSE 4.1)
5035;
5036; @param A0 Pointer to the first source operand (aka readonly destination).
5037; @param A1 Pointer to the second source operand.
5038; @param A2 Pointer to the EFLAGS register.
5039;
5040BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
5041 PROLOGUE_3_ARGS
5042 IEMIMPL_SSE_PROLOGUE
5043
5044 movdqu xmm0, [A0]
5045 movdqu xmm1, [A1]
5046 ptest xmm0, xmm1
5047 IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF
5048
5049 IEMIMPL_SSE_EPILOGUE
5050 EPILOGUE_3_ARGS
5051ENDPROC iemAImpl_ptest_u128
5052
5053BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
5054 PROLOGUE_3_ARGS
5055 IEMIMPL_SSE_PROLOGUE
5056
5057 vmovdqu ymm0, [A0]
5058 vmovdqu ymm1, [A1]
5059 vptest ymm0, ymm1
5060 IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF
5061
5062 IEMIMPL_SSE_EPILOGUE
5063 EPILOGUE_3_ARGS
5064ENDPROC iemAImpl_vptest_u256
5065
5066
5067;; Template for the vtestp{s,d} instructions
5068;
5069; @param 1 The instruction
5070;
5071; @param A0 Pointer to the first source operand (aka readonly destination).
5072; @param A1 Pointer to the second source operand.
5073; @param A2 Pointer to the EFLAGS register.
5074;
5075%macro IEMIMPL_VTESTP_S_D 1
5076BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
5077 PROLOGUE_3_ARGS
5078 IEMIMPL_AVX_PROLOGUE
5079
5080 vmovdqu xmm0, [A0]
5081 vmovdqu xmm1, [A1]
5082 %1 xmm0, xmm1
5083 IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF
5084
5085 IEMIMPL_AVX_EPILOGUE
5086 EPILOGUE_3_ARGS
5087ENDPROC iemAImpl_ %+ %1 %+ _u128
5088
5089BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
5090 PROLOGUE_3_ARGS
5091 IEMIMPL_AVX_PROLOGUE
5092
5093 vmovdqu ymm0, [A0]
5094 vmovdqu ymm1, [A1]
5095 %1 ymm0, ymm1
5096 IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF
5097
5098 IEMIMPL_AVX_EPILOGUE
5099 EPILOGUE_3_ARGS
5100ENDPROC iemAImpl_ %+ %1 %+ _u256
5101%endmacro
5102
5103IEMIMPL_VTESTP_S_D vtestps
5104IEMIMPL_VTESTP_S_D vtestpd
5105
5106
5107;;
5108; Template for the [v]pmov{s,z}x* instructions
5109;
5110; @param 1 The instruction
5111;
5112; @param A0 Pointer to the destination media register size operand (output).
5113; @param A1 The source operand value (input).
5114;
5115%macro IEMIMPL_V_PMOV_SZ_X 1
5116BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
5117 PROLOGUE_2_ARGS
5118 IEMIMPL_SSE_PROLOGUE
5119
5120 movd xmm0, A1
5121 %1 xmm0, xmm0
5122 vmovdqu [A0], xmm0
5123
5124 IEMIMPL_SSE_PROLOGUE
5125 EPILOGUE_2_ARGS
5126ENDPROC iemAImpl_ %+ %1 %+ _u128
5127
5128BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
5129 PROLOGUE_2_ARGS
5130 IEMIMPL_AVX_PROLOGUE
5131
5132 movd xmm0, A1
5133 v %+ %1 xmm0, xmm0
5134 vmovdqu [A0], xmm0
5135
5136 IEMIMPL_AVX_PROLOGUE
5137 EPILOGUE_2_ARGS
5138ENDPROC iemAImpl_v %+ %1 %+ _u128
5139
5140BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
5141 PROLOGUE_2_ARGS
5142 IEMIMPL_AVX_PROLOGUE
5143
5144 movdqu xmm0, [A1]
5145 v %+ %1 ymm0, xmm0
5146 vmovdqu [A0], ymm0
5147
5148 IEMIMPL_AVX_PROLOGUE
5149 EPILOGUE_2_ARGS
5150ENDPROC iemAImpl_v %+ %1 %+ _u256
5151%endmacro
5152
5153IEMIMPL_V_PMOV_SZ_X pmovsxbw
5154IEMIMPL_V_PMOV_SZ_X pmovsxbd
5155IEMIMPL_V_PMOV_SZ_X pmovsxbq
5156IEMIMPL_V_PMOV_SZ_X pmovsxwd
5157IEMIMPL_V_PMOV_SZ_X pmovsxwq
5158IEMIMPL_V_PMOV_SZ_X pmovsxdq
5159
5160IEMIMPL_V_PMOV_SZ_X pmovzxbw
5161IEMIMPL_V_PMOV_SZ_X pmovzxbd
5162IEMIMPL_V_PMOV_SZ_X pmovzxbq
5163IEMIMPL_V_PMOV_SZ_X pmovzxwd
5164IEMIMPL_V_PMOV_SZ_X pmovzxwq
5165IEMIMPL_V_PMOV_SZ_X pmovzxdq
5166
5167
5168;;
5169; Initialize the SSE MXCSR register using the guest value partially to
5170; account for rounding mode, load the value from the given register.
5171;
5172; @uses 4 bytes of stack to save the original value, T0.
5173; @param 1 Expression giving the register holding the guest's MXCSR.
5174;
5175%macro SSE_AVX_LD_MXCSR 1
5176 sub xSP, 4
5177
5178 stmxcsr [xSP]
5179 mov T0_32, %1
5180 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
5181 or T0_32, X86_MXCSR_XCPT_MASK
5182 sub xSP, 4
5183 mov [xSP], T0_32
5184 ldmxcsr [xSP]
5185 add xSP, 4
5186%endmacro
5187
5188
5189;;
5190; Restores the SSE MXCSR register with the original value.
5191;
5192; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
5193; @param 1 Expression giving the register to return the new guest's MXCSR value.
5194; @param 2 Expression giving the register holding original guest's MXCSR value.
5195;
5196; @note Restores the stack pointer.
5197;
5198%macro SSE_AVX_ST_MXCSR 2
5199 sub xSP, 4
5200 stmxcsr [xSP]
5201 mov %1, [xSP]
5202 add xSP, 4
5203 ; Merge the status bits into the original MXCSR value.
5204 and %1, X86_MXCSR_XCPT_FLAGS
5205 or %1, %2
5206
5207 ldmxcsr [xSP]
5208 add xSP, 4
5209%endmacro
5210
5211
5212;;
5213; Floating point instruction working on two full sized registers.
5214;
5215; @param 1 The instruction
5216; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
5217;
5218; @returns R0_32 The new MXCSR value of the guest.
5219; @param A0 The guest's MXCSR register value to use.
5220; @param A1 Where to return the result.
5221; @param A2 Pointer to the first media register size operand (input/output).
5222; @param A3 Pointer to the second media register size operand (input).
5223;
5224%macro IEMIMPL_FP_F2 2
5225BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
5226 PROLOGUE_4_ARGS
5227 IEMIMPL_SSE_PROLOGUE
5228 SSE_AVX_LD_MXCSR A0_32
5229
5230 movdqu xmm0, [A2]
5231 movdqu xmm1, [A3]
5232 %1 xmm0, xmm1
5233 movdqu [A1], xmm0
5234
5235 SSE_AVX_ST_MXCSR R0_32, A0_32
5236 IEMIMPL_SSE_PROLOGUE
5237 EPILOGUE_4_ARGS
5238ENDPROC iemAImpl_ %+ %1 %+ _u128
5239
5240 %if %2 == 3
5241BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
5242 PROLOGUE_4_ARGS
5243 IEMIMPL_AVX_PROLOGUE
5244 SSE_AVX_LD_MXCSR A0_32
5245
5246 vmovdqu xmm0, [A2]
5247 vmovdqu xmm1, [A3]
5248 v %+ %1 xmm0, xmm0, xmm1
5249 vmovdqu [A1], xmm0
5250
5251 SSE_AVX_ST_MXCSR R0_32, A0_32
5252 IEMIMPL_AVX_PROLOGUE
5253 EPILOGUE_4_ARGS
5254ENDPROC iemAImpl_v %+ %1 %+ _u128
5255
5256BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
5257 PROLOGUE_4_ARGS
5258 IEMIMPL_AVX_PROLOGUE
5259 SSE_AVX_LD_MXCSR A0_32
5260
5261 vmovdqu ymm0, [A2]
5262 vmovdqu ymm1, [A3]
5263 v %+ %1 ymm0, ymm0, ymm1
5264 vmovdqu [A1], ymm0
5265
5266 SSE_AVX_ST_MXCSR R0_32, A0_32
5267 IEMIMPL_AVX_PROLOGUE
5268 EPILOGUE_4_ARGS
5269ENDPROC iemAImpl_v %+ %1 %+ _u256
5270 %elif %2 == 2
5271BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
5272 PROLOGUE_4_ARGS
5273 IEMIMPL_AVX_PROLOGUE
5274 SSE_AVX_LD_MXCSR A0_32
5275
5276 vmovdqu xmm0, [A2]
5277 vmovdqu xmm1, [A3]
5278 v %+ %1 xmm0, xmm1
5279 vmovdqu [A1], xmm0
5280
5281 SSE_AVX_ST_MXCSR R0_32, A0_32
5282 IEMIMPL_AVX_PROLOGUE
5283 EPILOGUE_4_ARGS
5284ENDPROC iemAImpl_v %+ %1 %+ _u128
5285
5286BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
5287 PROLOGUE_4_ARGS
5288 IEMIMPL_AVX_PROLOGUE
5289 SSE_AVX_LD_MXCSR A0_32
5290
5291 vmovdqu ymm0, [A2]
5292 vmovdqu ymm1, [A3]
5293 v %+ %1 ymm0, ymm1
5294 vmovdqu [A1], ymm0
5295
5296 SSE_AVX_ST_MXCSR R0_32, A0_32
5297 IEMIMPL_AVX_PROLOGUE
5298 EPILOGUE_4_ARGS
5299ENDPROC iemAImpl_v %+ %1 %+ _u256
5300 %endif
5301%endmacro
5302
5303IEMIMPL_FP_F2 addps, 3
5304IEMIMPL_FP_F2 addpd, 3
5305IEMIMPL_FP_F2 mulps, 3
5306IEMIMPL_FP_F2 mulpd, 3
5307IEMIMPL_FP_F2 subps, 3
5308IEMIMPL_FP_F2 subpd, 3
5309IEMIMPL_FP_F2 minps, 3
5310IEMIMPL_FP_F2 minpd, 3
5311IEMIMPL_FP_F2 divps, 3
5312IEMIMPL_FP_F2 divpd, 3
5313IEMIMPL_FP_F2 maxps, 3
5314IEMIMPL_FP_F2 maxpd, 3
5315IEMIMPL_FP_F2 haddps, 3
5316IEMIMPL_FP_F2 haddpd, 3
5317IEMIMPL_FP_F2 hsubps, 3
5318IEMIMPL_FP_F2 hsubpd, 3
5319IEMIMPL_FP_F2 addsubps, 3
5320IEMIMPL_FP_F2 addsubpd, 3
5321
5322
5323;;
5324; These are actually unary operations but to keep it simple
5325; we treat them as binary for now, so the output result is
5326; always in sync with the register where the result might get written
5327; to.
5328IEMIMPL_FP_F2 sqrtps, 2
5329IEMIMPL_FP_F2 rsqrtps, 2
5330IEMIMPL_FP_F2 sqrtpd, 2
5331IEMIMPL_FP_F2 rcpps, 2
5332IEMIMPL_FP_F2 cvtdq2ps, 2
5333IEMIMPL_FP_F2 cvtps2dq, 2
5334IEMIMPL_FP_F2 cvttps2dq, 2
5335IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
5336IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
5337IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
5338
5339
5340;;
5341; Floating point instruction working on a full sized register and a single precision operand.
5342;
5343; @param 1 The instruction
5344;
5345; @return R0_32 The new MXCSR value of the guest.
5346; @param A0 The guest's MXCSR register value to use.
5347; @param A1 Where to return the result.
5348; @param A2 Pointer to the first media register size operand (input/output).
5349; @param A3 Pointer to the second single precision floating point value (input).
5350;
5351%macro IEMIMPL_FP_F2_R32 1
5352BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
5353 PROLOGUE_4_ARGS
5354 IEMIMPL_SSE_PROLOGUE
5355 SSE_AVX_LD_MXCSR A0_32
5356
5357 movdqu xmm0, [A2]
5358 movd xmm1, [A3]
5359 %1 xmm0, xmm1
5360 movdqu [A1], xmm0
5361
5362 SSE_AVX_ST_MXCSR R0_32, A0_32
5363 IEMIMPL_SSE_EPILOGUE
5364 EPILOGUE_4_ARGS
5365ENDPROC iemAImpl_ %+ %1 %+ _u128_r32
5366
5367BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
5368 PROLOGUE_4_ARGS
5369 IEMIMPL_AVX_PROLOGUE
5370 SSE_AVX_LD_MXCSR A0_32
5371
5372 vmovdqu xmm0, [A2]
5373 vmovd xmm1, [A3]
5374 v %+ %1 xmm0, xmm0, xmm1
5375 vmovdqu [A1], xmm0
5376
5377 SSE_AVX_ST_MXCSR R0_32, A0_32
5378 IEMIMPL_AVX_PROLOGUE
5379 EPILOGUE_4_ARGS
5380ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
5381%endmacro
5382
5383IEMIMPL_FP_F2_R32 addss
5384IEMIMPL_FP_F2_R32 mulss
5385IEMIMPL_FP_F2_R32 subss
5386IEMIMPL_FP_F2_R32 minss
5387IEMIMPL_FP_F2_R32 divss
5388IEMIMPL_FP_F2_R32 maxss
5389IEMIMPL_FP_F2_R32 cvtss2sd
5390IEMIMPL_FP_F2_R32 sqrtss
5391IEMIMPL_FP_F2_R32 rsqrtss
5392IEMIMPL_FP_F2_R32 rcpss
5393
5394
5395;;
5396; Floating point instruction working on a full sized register and a double precision operand.
5397;
5398; @param 1 The instruction
5399;
5400; @return R0_32 The new MXCSR value of the guest.
5401; @param A0 The guest's MXCSR register value to use.
5402; @param A1 Where to return the result.
5403; @param A2 Pointer to the first media register size operand (input/output).
5404; @param A3 Pointer to the second double precision floating point value (input).
5405;
5406%macro IEMIMPL_FP_F2_R64 1
5407BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
5408 PROLOGUE_4_ARGS
5409 IEMIMPL_SSE_PROLOGUE
5410 SSE_AVX_LD_MXCSR A0_32
5411
5412 movdqu xmm0, [A2]
5413 movq xmm1, [A3]
5414 %1 xmm0, xmm1
5415 movdqu [A1], xmm0
5416
5417 SSE_AVX_ST_MXCSR R0_32, A0_32
5418 IEMIMPL_SSE_EPILOGUE
5419 EPILOGUE_4_ARGS
5420ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
5421
5422BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
5423 PROLOGUE_4_ARGS
5424 IEMIMPL_AVX_PROLOGUE
5425 SSE_AVX_LD_MXCSR A0_32
5426
5427 vmovdqu xmm0, [A2]
5428 vmovq xmm1, [A3]
5429 v %+ %1 xmm0, xmm0, xmm1
5430 vmovdqu [A1], xmm0
5431
5432 SSE_AVX_ST_MXCSR R0_32, A0_32
5433 IEMIMPL_AVX_EPILOGUE
5434 EPILOGUE_4_ARGS
5435ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
5436%endmacro
5437
5438IEMIMPL_FP_F2_R64 addsd
5439IEMIMPL_FP_F2_R64 mulsd
5440IEMIMPL_FP_F2_R64 subsd
5441IEMIMPL_FP_F2_R64 minsd
5442IEMIMPL_FP_F2_R64 divsd
5443IEMIMPL_FP_F2_R64 maxsd
5444IEMIMPL_FP_F2_R64 cvtsd2ss
5445IEMIMPL_FP_F2_R64 sqrtsd
5446
5447
5448;;
5449; Macro for the cvtpd2ps/cvtps2pd instructions.
5450;
5451; 1 The instruction name.
5452; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
5453;
5454; @return R0_32 The new MXCSR value of the guest.
5455; @param A0_32 The guest's MXCSR register value to use.
5456; @param A1 Where to return the result.
5457; @param A2 Pointer to the first media register size operand (input/output).
5458; @param A3 Pointer to the second media register size operand (input).
5459;
5460%macro IEMIMPL_CVT_F2 2
5461BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5462 PROLOGUE_4_ARGS
5463 IEMIMPL_SSE_PROLOGUE
5464 SSE_AVX_LD_MXCSR A0_32
5465
5466 movdqu xmm0, [A2]
5467 movdqu xmm1, [A3]
5468 %1 xmm0, xmm1
5469 movdqu [A1], xmm0
5470
5471 SSE_AVX_ST_MXCSR R0_32, A0_32
5472 IEMIMPL_SSE_EPILOGUE
5473 EPILOGUE_4_ARGS
5474ENDPROC iemAImpl_ %+ %1 %+ _u128
5475
5476BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
5477 PROLOGUE_4_ARGS
5478 IEMIMPL_AVX_PROLOGUE
5479 SSE_AVX_LD_MXCSR A0_32
5480
5481 vmovdqu xmm0, [A2]
5482 vmovdqu xmm1, [A3]
5483 v %+ %1 xmm0, xmm1
5484 vmovdqu [A1], xmm0
5485
5486 SSE_AVX_ST_MXCSR R0_32, A0_32
5487 IEMIMPL_AVX_EPILOGUE
5488 EPILOGUE_4_ARGS
5489ENDPROC iemAImpl_v %+ %1 %+ _u128
5490
5491BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
5492 PROLOGUE_4_ARGS
5493 IEMIMPL_AVX_PROLOGUE
5494 SSE_AVX_LD_MXCSR A0_32
5495
5496 vmovdqu ymm0, [A2]
5497 vmovdqu ymm1, [A3]
5498 %if %2 == 0
5499 v %+ %1 xmm0, ymm1
5500 %else
5501 v %+ %1 ymm0, xmm1
5502 %endif
5503 vmovdqu [A1], ymm0
5504
5505 SSE_AVX_ST_MXCSR R0_32, A0_32
5506 IEMIMPL_AVX_EPILOGUE
5507 EPILOGUE_4_ARGS
5508ENDPROC iemAImpl_v %+ %1 %+ _u256
5509%endmacro
5510
5511IEMIMPL_CVT_F2 cvtpd2ps, 0
5512IEMIMPL_CVT_F2 cvtps2pd, 1
5513
5514
5515;;
5516; shufps instructions with 8-bit immediates.
5517;
5518; @param A0 Pointer to the destination media register size operand (input/output).
5519; @param A1 Pointer to the first source media register size operand (input).
5520; @param A2 The 8-bit immediate
5521;
5522BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
5523 PROLOGUE_3_ARGS
5524 IEMIMPL_SSE_PROLOGUE
5525
5526 movzx A2, A2_8 ; must clear top bits
5527 movdqu xmm0, [A0]
5528 movdqu xmm1, [A1]
5529 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5530 movdqu [A0], xmm0
5531
5532 IEMIMPL_SSE_EPILOGUE
5533 EPILOGUE_3_ARGS
5534 %assign bImm 0
5535 %rep 256
5536.imm %+ bImm:
5537 IBT_ENDBRxx_WITHOUT_NOTRACK
5538 shufps xmm0, xmm1, bImm
5539 ret
5540 int3
5541 %assign bImm bImm + 1
5542 %endrep
5543.immEnd:
5544ENDPROC iemAImpl_shufps_u128
5545
5546
5547;;
5548; shufpd instruction with 8-bit immediates.
5549;
5550; @param A0 Pointer to the destination media register size operand (input/output).
5551; @param A1 Pointer to the first source media register size operand (input).
5552; @param A2 The 8-bit immediate
5553;
5554BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
5555 PROLOGUE_3_ARGS
5556 IEMIMPL_SSE_PROLOGUE
5557
5558 movzx A2, A2_8 ; must clear top bits
5559 movdqu xmm0, [A0]
5560 movdqu xmm1, [A1]
5561 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5562 movdqu [A0], xmm0
5563
5564 IEMIMPL_SSE_EPILOGUE
5565 EPILOGUE_3_ARGS
5566 %assign bImm 0
5567 %rep 256
5568.imm %+ bImm:
5569 IBT_ENDBRxx_WITHOUT_NOTRACK
5570 shufpd xmm0, xmm1, bImm
5571 ret
5572 %assign bImm bImm + 1
5573 %endrep
5574.immEnd:
5575ENDPROC iemAImpl_shufpd_u128
5576
5577
5578;;
5579; vshufp{s,d} instructions with 8-bit immediates.
5580;
5581; @param 1 The instruction name.
5582;
5583; @param A0 Pointer to the destination media register size operand (output).
5584; @param A1 Pointer to the first source media register size operand (input).
5585; @param A2 Pointer to the second source media register size operand (input).
5586; @param A3 The 8-bit immediate
5587;
5588%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
5589BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5590 PROLOGUE_4_ARGS
5591 IEMIMPL_AVX_PROLOGUE
5592
5593 movzx A3, A3_8 ; must clear top bits
5594 movdqu xmm0, [A1]
5595 movdqu xmm1, [A2]
5596 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
5597 movdqu [A0], xmm0
5598
5599 IEMIMPL_AVX_EPILOGUE
5600 EPILOGUE_4_ARGS
5601 %assign bImm 0
5602 %rep 256
5603.imm %+ bImm:
5604 IBT_ENDBRxx_WITHOUT_NOTRACK
5605 %1 xmm0, xmm0, xmm1, bImm
5606 ret
5607 %assign bImm bImm + 1
5608 %endrep
5609.immEnd:
5610ENDPROC iemAImpl_ %+ %1 %+ _u128
5611
5612BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5613 PROLOGUE_4_ARGS
5614 IEMIMPL_AVX_PROLOGUE
5615
5616 movzx A3, A3_8 ; must clear top bits
5617 vmovdqu ymm0, [A1]
5618 vmovdqu ymm1, [A2]
5619 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
5620 vmovdqu [A0], ymm0
5621
5622 IEMIMPL_AVX_EPILOGUE
5623 EPILOGUE_4_ARGS
5624 %assign bImm 0
5625 %rep 256
5626.imm %+ bImm:
5627 IBT_ENDBRxx_WITHOUT_NOTRACK
5628 %1 ymm0, ymm0, ymm1, bImm
5629 ret
5630 %assign bImm bImm + 1
5631 %endrep
5632.immEnd:
5633ENDPROC iemAImpl_ %+ %1 %+ _u256
5634%endmacro
5635
5636IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
5637IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5638
5639
5640;;
5641; One of the [p]blendv{b,ps,pd} variants
5642;
5643; @param 1 The instruction
5644;
5645; @param A0 Pointer to the first media register sized operand (input/output).
5646; @param A1 Pointer to the second media sized value (input).
5647; @param A2 Pointer to the media register sized mask value (input).
5648;
5649%macro IEMIMPL_P_BLEND 1
5650BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5651 PROLOGUE_3_ARGS
5652 IEMIMPL_SSE_PROLOGUE
5653
5654 movdqu xmm0, [A2] ; This is implicit
5655 movdqu xmm1, [A0]
5656 movdqu xmm2, [A1] ; @todo Do I need to save the original value here first?
5657 %1 xmm1, xmm2
5658 movdqu [A0], xmm1
5659
5660 IEMIMPL_SSE_PROLOGUE
5661 EPILOGUE_3_ARGS
5662ENDPROC iemAImpl_ %+ %1 %+ _u128
5663%endmacro
5664
5665IEMIMPL_P_BLEND pblendvb
5666IEMIMPL_P_BLEND blendvps
5667IEMIMPL_P_BLEND blendvpd
5668
5669
5670;;
5671; One of the v[p]blendv{b,ps,pd} variants
5672;
5673; @param 1 The instruction
5674;
5675; @param A0 Pointer to the first media register sized operand (output).
5676; @param A1 Pointer to the first media register sized operand (input).
5677; @param A2 Pointer to the second media register sized operand (input).
5678; @param A3 Pointer to the media register sized mask value (input).
5679%macro IEMIMPL_AVX_P_BLEND 1
5680BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5681 PROLOGUE_4_ARGS
5682 IEMIMPL_AVX_PROLOGUE
5683
5684 vmovdqu xmm0, [A1]
5685 vmovdqu xmm1, [A2]
5686 vmovdqu xmm2, [A3]
5687 %1 xmm0, xmm0, xmm1, xmm2
5688 vmovdqu [A0], xmm0
5689
5690 IEMIMPL_AVX_PROLOGUE
5691 EPILOGUE_4_ARGS
5692ENDPROC iemAImpl_ %+ %1 %+ _u128
5693
5694BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5695 PROLOGUE_4_ARGS
5696 IEMIMPL_AVX_PROLOGUE
5697
5698 vmovdqu ymm0, [A1]
5699 vmovdqu ymm1, [A2]
5700 vmovdqu ymm2, [A3]
5701 %1 ymm0, ymm0, ymm1, ymm2
5702 vmovdqu [A0], ymm0
5703
5704 IEMIMPL_AVX_PROLOGUE
5705 EPILOGUE_4_ARGS
5706ENDPROC iemAImpl_ %+ %1 %+ _u256
5707%endmacro
5708
5709IEMIMPL_AVX_P_BLEND vpblendvb
5710IEMIMPL_AVX_P_BLEND vblendvps
5711IEMIMPL_AVX_P_BLEND vblendvpd
5712
5713
5714;;
5715; palignr mm1, mm2/m64 instruction.
5716;
5717; @param A0 Pointer to the first media register sized operand (output).
5718; @param A1 The second register sized operand (input).
5719; @param A2 The 8-bit immediate.
5720BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
5721 PROLOGUE_3_ARGS
5722 IEMIMPL_MMX_PROLOGUE
5723
5724 movzx A2, A2_8 ; must clear top bits
5725 movq mm0, [A0]
5726 movq mm1, A1
5727 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5728 movq [A0], mm0
5729
5730 IEMIMPL_MMX_EPILOGUE
5731 EPILOGUE_3_ARGS
5732 %assign bImm 0
5733 %rep 256
5734.imm %+ bImm:
5735 IBT_ENDBRxx_WITHOUT_NOTRACK
5736 palignr mm0, mm1, bImm
5737 ret
5738 %assign bImm bImm + 1
5739 %endrep
5740.immEnd:
5741ENDPROC iemAImpl_palignr_u64
5742
5743
5744;;
5745; SSE instructions with 8-bit immediates of the form
5746; xxx xmm1, xmm2, imm8.
5747; where the instruction encoding takes up 6 bytes.
5748;
5749; @param 1 The instruction name.
5750;
5751; @param A0 Pointer to the first media register size operand (input/output).
5752; @param A1 Pointer to the second source media register size operand (input).
5753; @param A2 The 8-bit immediate
5754;
5755%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
5756BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5757 PROLOGUE_3_ARGS
5758 IEMIMPL_SSE_PROLOGUE
5759
5760 movzx A2, A2_8 ; must clear top bits
5761 movdqu xmm0, [A0]
5762 movdqu xmm1, [A1]
5763 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
5764 movdqu [A0], xmm0
5765
5766 IEMIMPL_SSE_EPILOGUE
5767 EPILOGUE_3_ARGS
5768 %assign bImm 0
5769 %rep 256
5770.imm %+ bImm:
5771 IBT_ENDBRxx_WITHOUT_NOTRACK
5772 %1 xmm0, xmm1, bImm
5773 ret
5774 int3
5775 %assign bImm bImm + 1
5776 %endrep
5777.immEnd:
5778ENDPROC iemAImpl_ %+ %1 %+ _u128
5779%endmacro
5780
5781IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
5782IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
5783IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
5784IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
5785IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
5786IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5787IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5788
5789
5790;;
5791; AVX instructions with 8-bit immediates of the form
5792; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5793; where the instruction encoding takes up 6 bytes.
5794;
5795; @param 1 The instruction name.
5796; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5797; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5798;
5799; @param A0 Pointer to the destination media register size operand (output).
5800; @param A1 Pointer to the first source media register size operand (input).
5801; @param A2 Pointer to the second source media register size operand (input).
5802; @param A3 The 8-bit immediate
5803;
5804%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
5805 %if %2 == 1
5806BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5807 PROLOGUE_4_ARGS
5808 IEMIMPL_AVX_PROLOGUE
5809
5810 movzx A3, A3_8 ; must clear top bits
5811 movdqu xmm0, [A1]
5812 movdqu xmm1, [A2]
5813 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
5814 movdqu [A0], xmm0
5815
5816 IEMIMPL_AVX_EPILOGUE
5817 EPILOGUE_4_ARGS
5818 %assign bImm 0
5819 %rep 256
5820.imm %+ bImm:
5821 IBT_ENDBRxx_WITHOUT_NOTRACK
5822 %1 xmm0, xmm0, xmm1, bImm
5823 ret
5824 int3
5825 %assign bImm bImm + 1
5826 %endrep
5827.immEnd:
5828ENDPROC iemAImpl_ %+ %1 %+ _u128
5829 %endif
5830
5831 %if %3 == 1
5832BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5833 PROLOGUE_4_ARGS
5834 IEMIMPL_AVX_PROLOGUE
5835
5836 movzx A3, A3_8 ; must clear top bits
5837 vmovdqu ymm0, [A1]
5838 vmovdqu ymm1, [A2]
5839 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
5840 vmovdqu [A0], ymm0
5841
5842 IEMIMPL_AVX_EPILOGUE
5843 EPILOGUE_4_ARGS
5844 %assign bImm 0
5845 %rep 256
5846.imm %+ bImm:
5847 IBT_ENDBRxx_WITHOUT_NOTRACK
5848 %1 ymm0, ymm0, ymm1, bImm
5849 ret
5850 int3
5851 %assign bImm bImm + 1
5852 %endrep
5853.immEnd:
5854ENDPROC iemAImpl_ %+ %1 %+ _u256
5855 %endif
5856%endmacro
5857
5858IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
5859IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
5860IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
5861IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd, 1, 1
5862IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
5863IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
5864IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
5865IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
5866IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
5867
5868
5869;;
5870; AVX instructions with 8-bit immediates of the form
5871; xxx {x,y}mm1, {x,y}mm2, imm8.
5872; where the instruction encoding takes up 6 bytes.
5873;
5874; @param 1 The instruction name.
5875; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5876; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5877; @param 4 The number of bytes taken up by a single instance of the instruction.
5878;
5879; @param A0 Pointer to the destination media register size operand (output).
5880; @param A1 Pointer to the first source media register size operand (input).
5881; @param A2 The 8-bit immediate
5882;
5883%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP 4
5884 %if %2 == 1
5885BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
5886 PROLOGUE_4_ARGS
5887 IEMIMPL_AVX_PROLOGUE
5888
5889 movzx A2, A2_8 ; must clear top bits
5890 movdqu xmm1, [A1]
5891 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
5892 movdqu [A0], xmm0
5893
5894 IEMIMPL_AVX_EPILOGUE
5895 EPILOGUE_4_ARGS
5896 %assign bImm 0
5897 %rep 256
5898.imm %+ bImm:
5899 IBT_ENDBRxx_WITHOUT_NOTRACK
5900 %1 xmm0, xmm1, bImm
5901 ret
5902 int3
5903 %assign bImm bImm + 1
5904 %endrep
5905.immEnd:
5906ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
5907 %endif
5908
5909 %if %3 == 1
5910BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
5911 PROLOGUE_4_ARGS
5912 IEMIMPL_AVX_PROLOGUE
5913
5914 movzx A2, A2_8 ; must clear top bits
5915 vmovdqu ymm1, [A1]
5916 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
5917 vmovdqu [A0], ymm0
5918
5919 IEMIMPL_AVX_EPILOGUE
5920 EPILOGUE_4_ARGS
5921 %assign bImm 0
5922 %rep 256
5923.imm %+ bImm:
5924 IBT_ENDBRxx_WITHOUT_NOTRACK
5925 %1 ymm0, ymm1, bImm
5926 ret
5927 int3
5928 %assign bImm bImm + 1
5929 %endrep
5930.immEnd:
5931ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
5932 %endif
5933%endmacro
5934
5935IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilps, 1, 1, 8
5936IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilpd, 1, 1, 8
5937IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpslldq, 1, 1, 7
5938IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpsrldq, 1, 1, 7
5939
5940
5941;;
5942; Need to move this as well somewhere better?
5943;
5944struc IEMPCMPISTRXSRC
5945 .uSrc1 resd 4
5946 .uSrc2 resd 4
5947endstruc
5948
5949struc IEMPCMPESTRXSRC
5950 .uSrc1 resd 4
5951 .uSrc2 resd 4
5952 .u64Rax resd 2
5953 .u64Rdx resd 2
5954endstruc
5955
5956;;
5957; The pcmpistri instruction.
5958;
5959; @return R0_32 The new ECX value.
5960; @param A0 Pointer to the EFLAGS register.
5961; @param A1 Pointer to the first operand (input).
5962; @param A1 Pointer to the second operand (input).
5963; @param A3 The 8-bit immediate
5964;
5965BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
5966 PROLOGUE_4_ARGS
5967 IEMIMPL_SSE_PROLOGUE
5968
5969 movzx A3, A3_8 ; must clear top bits
5970 movdqu xmm0, [A1]
5971 movdqu xmm1, [A2]
5972 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5973 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
5974
5975 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
5976 mov R0_32, ecx
5977
5978 IEMIMPL_SSE_EPILOGUE
5979 EPILOGUE_4_ARGS
5980 %assign bImm 0
5981 %rep 256
5982.imm %+ bImm:
5983 IBT_ENDBRxx_WITHOUT_NOTRACK
5984 pcmpistri xmm0, xmm1, bImm
5985 ret
5986 int3
5987 %assign bImm bImm + 1
5988 %endrep
5989.immEnd:
5990ENDPROC iemAImpl_pcmpistri_u128
5991
5992;;
5993; The pcmpestri instruction.
5994;
5995; @param A0 Pointer to the ECX register to store the result to (output).
5996; @param A1 Pointer to the EFLAGS register.
5997; @param A2 Pointer to the structure containing the source operands (input).
5998; @param A3 The 8-bit immediate
5999;
6000BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
6001 PROLOGUE_4_ARGS
6002 IEMIMPL_SSE_PROLOGUE
6003
6004 movzx A3, A3_8 ; must clear top bits
6005 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
6006 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
6007 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
6008 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6009 push xDX ; xDX can be A1 or A2 depending on the calling convention
6010 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6011 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6012 IBT_NOTRACK
6013 call T1
6014
6015 pop xDX
6016 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6017 mov [T2], ecx
6018
6019 IEMIMPL_SSE_EPILOGUE
6020 EPILOGUE_4_ARGS
6021 %assign bImm 0
6022 %rep 256
6023.imm %+ bImm:
6024 IBT_ENDBRxx_WITHOUT_NOTRACK
6025 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
6026 pcmpestri xmm0, xmm1, bImm
6027 ret
6028 %assign bImm bImm + 1
6029 %endrep
6030.immEnd:
6031ENDPROC iemAImpl_pcmpestri_u128
6032
6033;;
6034; The pcmpistrm instruction template.
6035;
6036; @param A0 Pointer to the XMM0 register to store the result to (output).
6037; @param A1 Pointer to the EFLAGS register.
6038; @param A2 Pointer to the structure containing the source operands (input).
6039; @param A3 The 8-bit immediate
6040;
6041BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
6042 PROLOGUE_4_ARGS
6043 IEMIMPL_SSE_PROLOGUE
6044
6045 movzx A3, A3_8 ; must clear top bits
6046 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
6047 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
6048 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
6049
6050 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6051 movdqu [A0], xmm0
6052
6053 IEMIMPL_SSE_EPILOGUE
6054 EPILOGUE_4_ARGS
6055 %assign bImm 0
6056 %rep 256
6057.imm %+ bImm:
6058 IBT_ENDBRxx_WITHOUT_NOTRACK
6059 pcmpistrm xmm1, xmm2, bImm
6060 ret
6061 int3
6062 %assign bImm bImm + 1
6063 %endrep
6064.immEnd:
6065ENDPROC iemAImpl_pcmpistrm_u128
6066
6067;;
6068; The pcmpestrm instruction template.
6069;
6070; @param A0 Pointer to the XMM0 register to store the result to (output).
6071; @param A1 Pointer to the EFLAGS register.
6072; @param A2 Pointer to the structure containing the source operands (input).
6073; @param A3 The 8-bit immediate
6074;
6075BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
6076 PROLOGUE_4_ARGS
6077 IEMIMPL_SSE_PROLOGUE
6078
6079 movzx A3, A3_8 ; must clear top bits
6080 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
6081 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
6082 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6083 push xDX ; xDX can be A1 or A2 depending on the calling convention
6084 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6085 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6086 IBT_NOTRACK
6087 call T1
6088
6089 pop xDX
6090 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6091 movdqu [A0], xmm0
6092
6093 IEMIMPL_SSE_EPILOGUE
6094 EPILOGUE_4_ARGS
6095 %assign bImm 0
6096 %rep 256
6097.imm %+ bImm:
6098 IBT_ENDBRxx_WITHOUT_NOTRACK
6099 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
6100 pcmpestrm xmm1, xmm2, bImm
6101 ret
6102 %assign bImm bImm + 1
6103 %endrep
6104.immEnd:
6105ENDPROC iemAImpl_pcmpestrm_u128
6106
6107
6108;;
6109; movmskp{s,d} SSE instruction template
6110;
6111; @param 1 The SSE instruction name.
6112; @param 2 The AVX instruction name.
6113;
6114; @param A0 Pointer to the output register (output/byte sized).
6115; @param A1 Pointer to the source media register size operand (input).
6116;
6117%macro IEMIMPL_MEDIA_MOVMSK_P 2
6118BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6119 PROLOGUE_2_ARGS
6120 IEMIMPL_SSE_PROLOGUE
6121
6122 movdqu xmm0, [A1]
6123 %1 T0, xmm0
6124 mov byte [A0], T0_8
6125
6126 IEMIMPL_SSE_EPILOGUE
6127 EPILOGUE_2_ARGS
6128ENDPROC iemAImpl_ %+ %1 %+ _u128
6129
6130BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
6131 PROLOGUE_2_ARGS
6132 IEMIMPL_AVX_PROLOGUE
6133
6134 movdqu xmm0, [A1]
6135 %2 T0, xmm0
6136 mov byte [A0], T0_8
6137
6138 IEMIMPL_AVX_EPILOGUE
6139 EPILOGUE_2_ARGS
6140ENDPROC iemAImpl_ %+ %2 %+ _u128
6141
6142BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
6143 PROLOGUE_2_ARGS
6144 IEMIMPL_AVX_PROLOGUE
6145
6146 vmovdqu ymm0, [A1]
6147 %2 T0, ymm0
6148 mov byte [A0], T0_8
6149
6150 IEMIMPL_AVX_EPILOGUE
6151 EPILOGUE_2_ARGS
6152ENDPROC iemAImpl_ %+ %2 %+ _u256
6153%endmacro
6154
6155IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
6156IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6157
6158
6159;;
6160; cvttsd2si instruction - 32-bit variant.
6161;
6162; @return R0_32 The new MXCSR value of the guest.
6163; @param A0_32 The guest's MXCSR register value to use.
6164; @param A1 Pointer to the result operand (output).
6165; @param A2 Pointer to the second operand (input).
6166;
6167BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
6168 PROLOGUE_4_ARGS
6169 IEMIMPL_SSE_PROLOGUE
6170 SSE_AVX_LD_MXCSR A0_32
6171
6172 cvttsd2si T0_32, [A2]
6173 mov dword [A1], T0_32
6174
6175 SSE_AVX_ST_MXCSR R0_32, A0_32
6176 IEMIMPL_SSE_EPILOGUE
6177 EPILOGUE_4_ARGS
6178ENDPROC iemAImpl_cvttsd2si_i32_r64
6179
6180;;
6181; cvttsd2si instruction - 64-bit variant.
6182;
6183; @return R0_32 The new MXCSR value of the guest.
6184; @param A0_32 The guest's MXCSR register value to use.
6185; @param A1 Pointer to the result operand (output).
6186; @param A2 Pointer to the second operand (input).
6187;
6188BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
6189 PROLOGUE_3_ARGS
6190 IEMIMPL_SSE_PROLOGUE
6191 SSE_AVX_LD_MXCSR A0_32
6192
6193 cvttsd2si T0, [A2]
6194 mov qword [A1], T0
6195
6196 SSE_AVX_ST_MXCSR R0_32, A0_32
6197 IEMIMPL_SSE_EPILOGUE
6198 EPILOGUE_3_ARGS
6199ENDPROC iemAImpl_cvttsd2si_i64_r64
6200
6201
6202;;
6203; cvtsd2si instruction - 32-bit variant.
6204;
6205; @return R0_32 The new MXCSR value of the guest.
6206; @param A0_32 The guest's MXCSR register value to use.
6207; @param A1 Pointer to the result operand (output).
6208; @param A2 Pointer to the second operand (input).
6209;
6210BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
6211 PROLOGUE_3_ARGS
6212 IEMIMPL_SSE_PROLOGUE
6213 SSE_AVX_LD_MXCSR A0_32
6214
6215 cvtsd2si T0_32, [A2]
6216 mov dword [A1], T0_32
6217
6218 SSE_AVX_ST_MXCSR R0_32, A0_32
6219 IEMIMPL_SSE_EPILOGUE
6220 EPILOGUE_3_ARGS
6221ENDPROC iemAImpl_cvtsd2si_i32_r64
6222
6223;;
6224; cvtsd2si instruction - 64-bit variant.
6225;
6226; @return R0_32 The new MXCSR value of the guest.
6227; @param A0_32 The guest's MXCSR register value to use.
6228; @param A1 Pointer to the result operand (output).
6229; @param A2 Pointer to the second operand (input).
6230;
6231BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
6232 PROLOGUE_3_ARGS
6233 IEMIMPL_SSE_PROLOGUE
6234 SSE_AVX_LD_MXCSR A0_32
6235
6236 cvtsd2si T0, [A2]
6237 mov qword [A1], T0
6238
6239 SSE_AVX_ST_MXCSR R0_32, A0_32
6240 IEMIMPL_SSE_EPILOGUE
6241 EPILOGUE_3_ARGS
6242ENDPROC iemAImpl_cvtsd2si_i64_r64
6243
6244
6245;;
6246; cvttss2si instruction - 32-bit variant.
6247;
6248; @return R0_32 The new MXCSR value of the guest.
6249; @param A0_32 The guest's MXCSR register value to use.
6250; @param A1 Pointer to the result operand (output).
6251; @param A2 Pointer to the second operand (input).
6252;
6253BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
6254 PROLOGUE_3_ARGS
6255 IEMIMPL_SSE_PROLOGUE
6256 SSE_AVX_LD_MXCSR A0_32
6257
6258 cvttss2si T0_32, [A2]
6259 mov dword [A1], T0_32
6260
6261 SSE_AVX_ST_MXCSR R0_32, A0_32
6262 IEMIMPL_SSE_EPILOGUE
6263 EPILOGUE_3_ARGS
6264ENDPROC iemAImpl_cvttss2si_i32_r32
6265
6266;;
6267; cvttss2si instruction - 64-bit variant.
6268;
6269; @return R0_32 The new MXCSR value of the guest.
6270; @param A0_32 The guest's MXCSR register value to use.
6271; @param A1 Pointer to the result operand (output).
6272; @param A2 Pointer to the second operand (input).
6273;
6274BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
6275 PROLOGUE_3_ARGS
6276 IEMIMPL_SSE_PROLOGUE
6277 SSE_AVX_LD_MXCSR A0_32
6278
6279 cvttss2si T0, [A2]
6280 mov qword [A1], T0
6281
6282 SSE_AVX_ST_MXCSR R0_32, A0_32
6283 IEMIMPL_SSE_EPILOGUE
6284 EPILOGUE_3_ARGS
6285ENDPROC iemAImpl_cvttss2si_i64_r32
6286
6287
6288;;
6289; cvtss2si instruction - 32-bit variant.
6290;
6291; @return R0_32 The new MXCSR value of the guest.
6292; @param A0_32 The guest's MXCSR register value to use.
6293; @param A1 Pointer to the result operand (output).
6294; @param A2 Pointer to the second operand (input).
6295;
6296BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
6297 PROLOGUE_3_ARGS
6298 IEMIMPL_SSE_PROLOGUE
6299 SSE_AVX_LD_MXCSR A0_32
6300
6301 cvtss2si T0_32, [A2]
6302 mov dword [A1], T0_32
6303
6304 SSE_AVX_ST_MXCSR R0_32, A0_32
6305 IEMIMPL_SSE_EPILOGUE
6306 EPILOGUE_3_ARGS
6307ENDPROC iemAImpl_cvtss2si_i32_r32
6308
6309;;
6310; cvtss2si instruction - 64-bit variant.
6311;
6312; @return R0_32 The new MXCSR value of the guest.
6313; @param A0_32 The guest's MXCSR register value to use.
6314; @param A1 Pointer to the result operand (output).
6315; @param A2 Pointer to the second operand (input).
6316;
6317BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
6318 PROLOGUE_3_ARGS
6319 IEMIMPL_SSE_PROLOGUE
6320 SSE_AVX_LD_MXCSR A0_32
6321
6322 cvtss2si T0, [A2]
6323 mov qword [A1], T0
6324
6325 SSE_AVX_ST_MXCSR R0_32, A0_32
6326 IEMIMPL_SSE_EPILOGUE
6327 EPILOGUE_3_ARGS
6328ENDPROC iemAImpl_cvtss2si_i64_r32
6329
6330
6331;;
6332; cvtsi2ss instruction - 32-bit variant.
6333;
6334; @return R0_32 The new MXCSR value of the guest.
6335; @param A0_32 The guest's MXCSR register value to use.
6336; @param A1 Pointer to the result operand (output).
6337; @param A2 Pointer to the second operand (input).
6338;
6339BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
6340 PROLOGUE_3_ARGS
6341 IEMIMPL_SSE_PROLOGUE
6342 SSE_AVX_LD_MXCSR A0_32
6343
6344 cvtsi2ss xmm0, dword [A2]
6345 movd dword [A1], xmm0
6346
6347 SSE_AVX_ST_MXCSR R0_32, A0_32
6348 IEMIMPL_SSE_EPILOGUE
6349 EPILOGUE_3_ARGS
6350ENDPROC iemAImpl_cvtsi2ss_r32_i32
6351
6352;;
6353; cvtsi2ss instruction - 64-bit variant.
6354;
6355; @return R0_32 The new MXCSR value of the guest.
6356; @param A0_32 The guest's MXCSR register value to use.
6357; @param A1 Pointer to the result operand (output).
6358; @param A2 Pointer to the second operand (input).
6359;
6360BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
6361 PROLOGUE_3_ARGS
6362 IEMIMPL_SSE_PROLOGUE
6363 SSE_AVX_LD_MXCSR A0_32
6364
6365 cvtsi2ss xmm0, qword [A2]
6366 movd dword [A1], xmm0
6367
6368 SSE_AVX_ST_MXCSR R0_32, A0_32
6369 IEMIMPL_SSE_EPILOGUE
6370 EPILOGUE_3_ARGS
6371ENDPROC iemAImpl_cvtsi2ss_r32_i64
6372
6373
6374;;
6375; cvtsi2sd instruction - 32-bit variant.
6376;
6377; @return R0_32 The new MXCSR value of the guest.
6378; @param A0_32 The guest's MXCSR register value to use.
6379; @param A1 Pointer to the result operand (output).
6380; @param A2 Pointer to the second operand (input).
6381;
6382BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
6383 PROLOGUE_3_ARGS
6384 IEMIMPL_SSE_PROLOGUE
6385 SSE_AVX_LD_MXCSR A0_32
6386
6387 cvtsi2sd xmm0, dword [A2]
6388 movq [A1], xmm0
6389
6390 SSE_AVX_ST_MXCSR R0_32, A0_32
6391 IEMIMPL_SSE_EPILOGUE
6392 EPILOGUE_3_ARGS
6393ENDPROC iemAImpl_cvtsi2sd_r64_i32
6394
6395;;
6396; cvtsi2sd instruction - 64-bit variant.
6397;
6398; @return R0_32 The new MXCSR value of the guest.
6399; @param A0_32 The guest's MXCSR register value to use.
6400; @param A1 Pointer to the result operand (output).
6401; @param A2 Pointer to the second operand (input).
6402;
6403BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
6404 PROLOGUE_3_ARGS
6405 IEMIMPL_SSE_PROLOGUE
6406 SSE_AVX_LD_MXCSR A0_32
6407
6408 cvtsi2sd xmm0, qword [A2]
6409 movq [A1], xmm0
6410
6411 SSE_AVX_ST_MXCSR R0_32, A0_32
6412 IEMIMPL_SSE_EPILOGUE
6413 EPILOGUE_3_ARGS
6414ENDPROC iemAImpl_cvtsi2sd_r64_i64
6415
6416
6417;
6418; UCOMISS (SSE)
6419;
6420; @return R0_32 The new MXCSR value of the guest.
6421; @param A0_32 The guest's MXCSR register value to use (input).
6422; @param A1 Pointer to the EFLAGS value (input/output).
6423; @param A2_32 The first source operand.
6424; @param A3_32 The second source operand.
6425;
6426BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
6427 PROLOGUE_4_ARGS
6428 IEMIMPL_SSE_PROLOGUE
6429 SSE_AVX_LD_MXCSR A0_32
6430
6431 movd xmm0, A2_32
6432 movd xmm1, A3_32
6433 ucomiss xmm0, xmm1
6434 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6435
6436 SSE_AVX_ST_MXCSR R0_32, A0_32
6437 IEMIMPL_SSE_EPILOGUE
6438 EPILOGUE_4_ARGS
6439ENDPROC iemAImpl_ucomiss_u128
6440
6441BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
6442 PROLOGUE_4_ARGS
6443 IEMIMPL_SSE_PROLOGUE
6444 SSE_AVX_LD_MXCSR A0_32
6445
6446 movd xmm0, A2_32
6447 movd xmm1, A3_32
6448 vucomiss xmm0, xmm1
6449 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6450
6451 SSE_AVX_ST_MXCSR R0_32, A0_32
6452 IEMIMPL_SSE_EPILOGUE
6453 EPILOGUE_3_ARGS
6454ENDPROC iemAImpl_vucomiss_u128
6455
6456
6457;
6458; UCOMISD (SSE)
6459;
6460; @return R0_32 The new MXCSR value of the guest.
6461; @param A0_32 The guest's MXCSR register value to use (input).
6462; @param A1 Pointer to the EFLAGS value (input/output).
6463; @param A2 The first source operand.
6464; @param A3 The second source operand.
6465;
6466BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
6467 PROLOGUE_4_ARGS
6468 IEMIMPL_SSE_PROLOGUE
6469 SSE_AVX_LD_MXCSR A0_32
6470
6471 movq xmm0, A2
6472 movq xmm1, A3
6473 ucomisd xmm0, xmm1
6474 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6475
6476 SSE_AVX_ST_MXCSR R0_32, A0_32
6477 IEMIMPL_SSE_EPILOGUE
6478 EPILOGUE_4_ARGS
6479ENDPROC iemAImpl_ucomisd_u128
6480
6481BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
6482 PROLOGUE_4_ARGS
6483 IEMIMPL_SSE_PROLOGUE
6484 SSE_AVX_LD_MXCSR A0_32
6485
6486 movq xmm0, A2
6487 movq xmm1, A3
6488 vucomisd xmm0, xmm1
6489 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6490
6491 SSE_AVX_ST_MXCSR R0_32, A0_32
6492 IEMIMPL_SSE_EPILOGUE
6493 EPILOGUE_4_ARGS
6494ENDPROC iemAImpl_vucomisd_u128
6495
6496;
6497; COMISS (SSE)
6498;
6499; @return R0_32 The new MXCSR value of the guest.
6500; @param A0_32 The guest's MXCSR register value to use (input).
6501; @param A1 Pointer to the EFLAGS value (input/output).
6502; @param A2_32 The first source operand.
6503; @param A3_32 The second source operand.
6504;
6505BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
6506 PROLOGUE_4_ARGS
6507 IEMIMPL_SSE_PROLOGUE
6508 SSE_AVX_LD_MXCSR A0_32
6509
6510 movd xmm0, A2_32
6511 movd xmm1, A3_32
6512 comiss xmm0, xmm1
6513 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6514
6515 SSE_AVX_ST_MXCSR R0_32, A0_32
6516 IEMIMPL_SSE_EPILOGUE
6517 EPILOGUE_4_ARGS
6518ENDPROC iemAImpl_comiss_u128
6519
6520BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
6521 PROLOGUE_4_ARGS
6522 IEMIMPL_SSE_PROLOGUE
6523 SSE_AVX_LD_MXCSR A0_32
6524
6525 movd xmm0, A2_32
6526 movd xmm1, A3_32
6527 vcomiss xmm0, xmm1
6528 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6529
6530 SSE_AVX_ST_MXCSR R0_32, A0_32
6531 IEMIMPL_SSE_EPILOGUE
6532 EPILOGUE_4_ARGS
6533ENDPROC iemAImpl_vcomiss_u128
6534
6535
6536;
6537; COMISD (SSE)
6538;
6539; @return R0_32 The new MXCSR value of the guest.
6540; @param A0_32 The guest's MXCSR register value to use (input).
6541; @param A1 Pointer to the EFLAGS value (input/output).
6542; @param A2 The first source operand.
6543; @param A3 The second source operand.
6544;
6545BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
6546 PROLOGUE_4_ARGS
6547 IEMIMPL_SSE_PROLOGUE
6548 SSE_AVX_LD_MXCSR A0_32
6549
6550 movq xmm0, A2
6551 movq xmm1, A3
6552 comisd xmm0, xmm1
6553 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6554
6555 SSE_AVX_ST_MXCSR R0_32, A0_32
6556 IEMIMPL_SSE_EPILOGUE
6557 EPILOGUE_4_ARGS
6558ENDPROC iemAImpl_comisd_u128
6559
6560BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
6561 PROLOGUE_4_ARGS
6562 IEMIMPL_SSE_PROLOGUE
6563 SSE_AVX_LD_MXCSR A0_32
6564
6565 movq xmm0, A2
6566 movq xmm1, A3
6567 vcomisd xmm0, xmm1
6568 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6569
6570 SSE_AVX_ST_MXCSR R0_32, A0_32
6571 IEMIMPL_SSE_EPILOGUE
6572 EPILOGUE_4_ARGS
6573ENDPROC iemAImpl_vcomisd_u128
6574
6575
6576;;
6577; Need to move this as well somewhere better?
6578;
6579struc IEMMEDIAF2XMMSRC
6580 .uSrc1 resd 4
6581 .uSrc2 resd 4
6582endstruc
6583
6584
6585;
6586; CMPPS (SSE)
6587;
6588; @return R0_32 The new MXCSR value of the guest.
6589; @param A0_32 The guest's MXCSR register value to use (input).
6590; @param A1 Pointer to the first media register size operand (output).
6591; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6592; @param A3 The 8-bit immediate (input).
6593;
6594BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
6595 PROLOGUE_4_ARGS
6596 IEMIMPL_SSE_PROLOGUE
6597 SSE_AVX_LD_MXCSR A0_32
6598
6599 movzx A3, A3_8 ; must clear top bits
6600 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6601 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6602 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 5
6603 movdqu [A1], xmm0
6604
6605 SSE_AVX_ST_MXCSR R0_32, A0_32
6606 IEMIMPL_SSE_EPILOGUE
6607 EPILOGUE_4_ARGS
6608 %assign bImm 0
6609 %rep 256
6610.imm %+ bImm:
6611 IBT_ENDBRxx_WITHOUT_NOTRACK
6612 cmpps xmm0, xmm1, bImm
6613 ret
6614 %assign bImm bImm + 1
6615 %endrep
6616.immEnd:
6617ENDPROC iemAImpl_cmpps_u128
6618
6619;;
6620; SSE instructions with 8-bit immediates of the form
6621; xxx xmm1, xmm2, imm8.
6622; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
6623; register.
6624;
6625; @param 1 The instruction name.
6626;
6627; @return R0_32 The new MXCSR value of the guest.
6628; @param A0_32 The guest's MXCSR register value to use (input).
6629; @param A1 Pointer to the first media register size operand (output).
6630; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6631; @param A3 The 8-bit immediate (input).
6632;
6633%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
6634BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6635 PROLOGUE_4_ARGS
6636 IEMIMPL_SSE_PROLOGUE
6637 SSE_AVX_LD_MXCSR A0_32
6638
6639 movzx A3, A3_8 ; must clear top bits
6640 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6641 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6642 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
6643 movdqu [A1], xmm0
6644
6645 SSE_AVX_ST_MXCSR R0_32, A0_32
6646 IEMIMPL_SSE_EPILOGUE
6647 EPILOGUE_4_ARGS
6648 %assign bImm 0
6649 %rep 256
6650.imm %+ bImm:
6651 IBT_ENDBRxx_WITHOUT_NOTRACK
6652 %1 xmm0, xmm1, bImm
6653 ret
6654 %assign bImm bImm + 1
6655 %endrep
6656.immEnd:
6657ENDPROC iemAImpl_ %+ %1 %+ _u128
6658%endmacro
6659
6660IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
6661IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
6662IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6663
6664;;
6665; SSE instructions with 8-bit immediates of the form
6666; xxx xmm1, xmm2, imm8.
6667; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6668; register.
6669;
6670; @param 1 The instruction name.
6671;
6672; @return R0_32 The new MXCSR value of the guest.
6673; @param A0_32 The guest's MXCSR register value to use (input).
6674; @param A1 Pointer to the first media register size operand (output).
6675; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6676; @param A3 The 8-bit immediate (input).
6677;
6678%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
6679BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6680 PROLOGUE_4_ARGS
6681 IEMIMPL_SSE_PROLOGUE
6682 SSE_AVX_LD_MXCSR A0_32
6683
6684 movzx A3, A3_8 ; must clear top bits
6685 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6686 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6687 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
6688 movdqu [A1], xmm0
6689
6690 SSE_AVX_ST_MXCSR R0_32, A0_32
6691 IEMIMPL_SSE_EPILOGUE
6692 EPILOGUE_4_ARGS
6693 %assign bImm 0
6694 %rep 256
6695.imm %+ bImm:
6696 IBT_ENDBRxx_WITHOUT_NOTRACK
6697 %1 xmm0, xmm1, bImm
6698 ret
6699 int3
6700 %assign bImm bImm + 1
6701 %endrep
6702.immEnd:
6703ENDPROC iemAImpl_ %+ %1 %+ _u128
6704%endmacro
6705
6706IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
6707IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
6708IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
6709IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6710IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6711IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6712
6713
6714;;
6715; SSE instructions of the form
6716; xxx mm, xmm.
6717; and we need to load and save the MXCSR register.
6718;
6719; @param 1 The instruction name.
6720;
6721; @return R0_32 The new MXCSR value of the guest.
6722; @param A0_32 The guest's MXCSR register value to use (input).
6723; @param A1 Pointer to the first MMX register sized operand (output).
6724; @param A2 Pointer to the media register sized operand (input).
6725;
6726%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
6727BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6728 PROLOGUE_3_ARGS
6729 IEMIMPL_SSE_PROLOGUE
6730 SSE_AVX_LD_MXCSR A0_32
6731
6732 movdqu xmm0, [A2]
6733 %1 mm0, xmm0
6734 movq [A1], mm0
6735
6736 SSE_AVX_ST_MXCSR R0_32, A0_32
6737 IEMIMPL_SSE_EPILOGUE
6738 EPILOGUE_3_ARGS
6739ENDPROC iemAImpl_ %+ %1 %+ _u128
6740%endmacro
6741
6742IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
6743IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6744
6745;;
6746; SSE instructions of the form
6747; xxx xmm, xmm/m64.
6748; and we need to load and save the MXCSR register.
6749;
6750; @param 1 The instruction name.
6751;
6752; @return R0_32 The new MXCSR value of the guest.
6753; @param A0_32 The guest's MXCSR register value to use (input).
6754; @param A1 Pointer to the first media register sized operand (input/output).
6755; @param A2 The 64bit source value from a MMX media register (input)
6756;
6757%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
6758BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6759 PROLOGUE_3_ARGS
6760 IEMIMPL_SSE_PROLOGUE
6761 SSE_AVX_LD_MXCSR A0_32
6762
6763 movdqu xmm0, [A1]
6764 movq mm0, A2
6765 %1 xmm0, mm0
6766 movdqu [A1], xmm0
6767
6768 SSE_AVX_ST_MXCSR R0_32, A0_32
6769 IEMIMPL_SSE_EPILOGUE
6770 EPILOGUE_3_ARGS
6771ENDPROC iemAImpl_ %+ %1 %+ _u128
6772%endmacro
6773
6774IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
6775IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6776
6777;;
6778; SSE instructions of the form
6779; xxx mm, xmm/m64.
6780; and we need to load and save the MXCSR register.
6781;
6782; @param 1 The instruction name.
6783;
6784; @return R0_32 The new MXCSR value of the guest.
6785; @param A0_32 The guest's MXCSR register value to use (input).
6786; @param A1 Pointer to the first MMX media register sized operand (output).
6787; @param A2 The 64bit source value (input).
6788;
6789%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
6790BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6791 PROLOGUE_3_ARGS
6792 IEMIMPL_SSE_PROLOGUE
6793 SSE_AVX_LD_MXCSR A0_32
6794
6795 movq xmm0, A2
6796 %1 mm0, xmm0
6797 movq [A1], mm0
6798
6799 SSE_AVX_ST_MXCSR R0_32, A0_32
6800 IEMIMPL_SSE_EPILOGUE
6801 EPILOGUE_3_ARGS
6802ENDPROC iemAImpl_ %+ %1 %+ _u128
6803%endmacro
6804
6805IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
6806IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6807
6808;
6809; All forms of RDRAND and RDSEED
6810;
6811; @param A0 Pointer to the destination operand.
6812; @param A1 Pointer to the EFLAGS value (input/output).
6813;
6814%macro IEMIMPL_RDRAND_RDSEED 3
6815BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
6816 PROLOGUE_2_ARGS
6817
6818 %1 %2
6819 mov [A0], %2
6820 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF
6821
6822 EPILOGUE_2_ARGS
6823ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
6824%endmacro
6825
6826IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
6827IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
6828IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
6829IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
6830IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
6831IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6832
6833
6834;;
6835; sha1rnds4 xmm1, xmm2, imm8.
6836;
6837; @param 1 The instruction name.
6838;
6839; @param A0 Pointer to the first media register size operand (input/output).
6840; @param A1 Pointer to the second source media register size operand (input).
6841; @param A2 The 8-bit immediate
6842;
6843BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
6844 PROLOGUE_3_ARGS
6845 IEMIMPL_SSE_PROLOGUE
6846
6847 movzx A2, A2_8 ; must clear top bits
6848 movdqu xmm0, [A0]
6849 movdqu xmm1, [A1]
6850 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
6851 movdqu [A0], xmm0
6852
6853 IEMIMPL_SSE_EPILOGUE
6854 EPILOGUE_3_ARGS
6855 %assign bImm 0
6856 %rep 256
6857.imm %+ bImm:
6858 IBT_ENDBRxx_WITHOUT_NOTRACK
6859 sha1rnds4 xmm0, xmm1, bImm
6860 ret
6861 %assign bImm bImm + 1
6862 %endrep
6863.immEnd:
6864ENDPROC iemAImpl_sha1rnds4_u128
6865
6866
6867;;
6868; sha256rnds2 xmm1, xmm2, <XMM0>.
6869;
6870; @param 1 The instruction name.
6871;
6872; @param A0 Pointer to the first media register size operand (input/output).
6873; @param A1 Pointer to the second source media register size operand (input).
6874; @param A2 Pointer to the implicit XMM0 constants (input).
6875;
6876BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
6877 PROLOGUE_3_ARGS
6878 IEMIMPL_SSE_PROLOGUE
6879
6880 movdqu xmm0, [A2]
6881 movdqu xmm1, [A0]
6882 movdqu xmm2, [A1]
6883 sha256rnds2 xmm1, xmm2
6884 movdqu [A0], xmm1
6885
6886 IEMIMPL_SSE_EPILOGUE
6887 EPILOGUE_3_ARGS
6888ENDPROC iemAImpl_sha256rnds2_u128
6889
6890
6891;
6892; 32-bit forms of ADCX and ADOX
6893;
6894; @returns Updated EFLAGS.
6895; @param A0 Incoming EFLAGS value (input).
6896; @param A1 Pointer to the destination operand (input/output).
6897; @param A2 32-bit source operand 1 (input).
6898;
6899%macro IEMIMPL_ADX_32 2
6900BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
6901 PROLOGUE_4_ARGS
6902
6903 IEM_LOAD_FLAGS A0_32, %2, 0
6904 %1 A2_32, [A1]
6905 mov [A1], A2_32
6906 IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0
6907
6908 EPILOGUE_4_ARGS
6909ENDPROC iemAImpl_ %+ %1 %+ _u32
6910%endmacro
6911
6912;
6913; 64-bit forms of ADCX and ADOX
6914;
6915; @returns Updated EFLAGS.
6916; @param A0 Incoming EFLAGS value (input).
6917; @param A1 Pointer to the destination operand (input/output).
6918; @param A2 64-bit source operand 1 (input).
6919;
6920%macro IEMIMPL_ADX_64 2
6921BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
6922 PROLOGUE_4_ARGS
6923
6924 IEM_LOAD_FLAGS A0_32, %2, 0
6925 %1 A2, [A1]
6926 mov [A1], A2
6927 IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0
6928
6929 EPILOGUE_4_ARGS
6930ENDPROC iemAImpl_ %+ %1 %+ _u64
6931%endmacro
6932
6933IEMIMPL_ADX_32 adcx, X86_EFL_CF
6934IEMIMPL_ADX_64 adcx, X86_EFL_CF
6935
6936IEMIMPL_ADX_32 adox, X86_EFL_OF
6937IEMIMPL_ADX_64 adox, X86_EFL_OF
6938
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette