VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 104206

Last change on this file since 104206 was 104206, checked in by vboxsync, 8 months ago

VMM/IEM: Refactoring assembly helpers to not pass eflags by reference but instead by value and return the updated value (via eax/w0) - first chunk: IMUL(two ops), BSF, BSR, LZCNT, TZCNT, POPCNT. bugref:10376

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 203.4 KB
Line 
1; $Id: IEMAllAImpl.asm 104206 2024-04-05 20:28:19Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2024 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; This is handy for generating absolutely correct EFLAGS.
;%define IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
44
45
46;;
47; RET XX / RET wrapper for fastcall.
48;
49%macro RET_FASTCALL 1
50%ifdef RT_ARCH_X86
51 %ifdef RT_OS_WINDOWS
52 ret %1
53 %else
54 ret
55 %endif
56%else
57 ret
58%endif
59%endmacro
60
61;;
62; NAME for fastcall functions.
63;
64;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
65; escaping (or whatever the dollar is good for here). Thus the ugly
66; prefix argument.
67;
68%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
69%ifdef RT_ARCH_X86
70 %ifdef RT_OS_WINDOWS
71 %undef NAME_FASTCALL
72 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
73 %endif
74%endif
75
76;;
77; BEGINPROC for fastcall functions.
78;
79; @param 1 The function name (C).
80; @param 2 The argument size on x86.
81;
82%macro BEGINPROC_FASTCALL 2
83GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
84 IBT_ENDBRxx
85%endmacro
86
87
88;
89; We employ some macro assembly here to hid the calling convention differences.
90;
91%ifdef RT_ARCH_AMD64
92 %macro PROLOGUE_1_ARGS 0
93 %endmacro
94 %macro EPILOGUE_1_ARGS 0
95 ret
96 %endmacro
97 %macro EPILOGUE_1_ARGS_EX 0
98 ret
99 %endmacro
100
101 %macro PROLOGUE_2_ARGS 0
102 %endmacro
103 %macro EPILOGUE_2_ARGS 0
104 ret
105 %endmacro
106 %macro EPILOGUE_2_ARGS_EX 1
107 ret
108 %endmacro
109
110 %macro PROLOGUE_3_ARGS 0
111 %endmacro
112 %macro EPILOGUE_3_ARGS 0
113 ret
114 %endmacro
115 %macro EPILOGUE_3_ARGS_EX 1
116 ret
117 %endmacro
118
119 %macro PROLOGUE_4_ARGS 0
120 %endmacro
121 %macro EPILOGUE_4_ARGS 0
122 ret
123 %endmacro
124 %macro EPILOGUE_4_ARGS_EX 1
125 ret
126 %endmacro
127
128 %ifdef ASM_CALL64_GCC
129 %define A0 rdi
130 %define A0_32 edi
131 %define A0_16 di
132 %define A0_8 dil
133
134 %define A1 rsi
135 %define A1_32 esi
136 %define A1_16 si
137 %define A1_8 sil
138
139 %define A2 rdx
140 %define A2_32 edx
141 %define A2_16 dx
142 %define A2_8 dl
143
144 %define A3 rcx
145 %define A3_32 ecx
146 %define A3_16 cx
147 %define A3_8 cl
148 %endif
149
150 %ifdef ASM_CALL64_MSC
151 %define A0 rcx
152 %define A0_32 ecx
153 %define A0_16 cx
154 %define A0_8 cl
155
156 %define A1 rdx
157 %define A1_32 edx
158 %define A1_16 dx
159 %define A1_8 dl
160
161 %define A2 r8
162 %define A2_32 r8d
163 %define A2_16 r8w
164 %define A2_8 r8b
165
166 %define A3 r9
167 %define A3_32 r9d
168 %define A3_16 r9w
169 %define A3_8 r9b
170 %endif
171
172 %define T0 rax
173 %define T0_32 eax
174 %define T0_16 ax
175 %define T0_8 al
176
177 %define T1 r11
178 %define T1_32 r11d
179 %define T1_16 r11w
180 %define T1_8 r11b
181
182 %define T2 r10 ; only AMD64
183 %define T2_32 r10d
184 %define T2_16 r10w
185 %define T2_8 r10b
186
187 ;
188 ; Return value, same as T0 but to make it more obvious
189 ; that this is a return value.
190 ;
191 %define R0 rax
192 %define R0_32 eax
193 %define R0_16 ax
194 %define R0_8 al
195
196%else
197 ; x86
198 %macro PROLOGUE_1_ARGS 0
199 push edi
200 %endmacro
201 %macro EPILOGUE_1_ARGS 0
202 pop edi
203 ret 0
204 %endmacro
205 %macro EPILOGUE_1_ARGS_EX 1
206 pop edi
207 ret %1
208 %endmacro
209
210 %macro PROLOGUE_2_ARGS 0
211 push edi
212 %endmacro
213 %macro EPILOGUE_2_ARGS 0
214 pop edi
215 ret 0
216 %endmacro
217 %macro EPILOGUE_2_ARGS_EX 1
218 pop edi
219 ret %1
220 %endmacro
221
222 %macro PROLOGUE_3_ARGS 0
223 push ebx
224 mov ebx, [esp + 4 + 4]
225 push edi
226 %endmacro
227 %macro EPILOGUE_3_ARGS_EX 1
228 %if (%1) < 4
229 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
230 %endif
231 pop edi
232 pop ebx
233 ret %1
234 %endmacro
235 %macro EPILOGUE_3_ARGS 0
236 EPILOGUE_3_ARGS_EX 4
237 %endmacro
238
239 %macro PROLOGUE_4_ARGS 0
240 push ebx
241 push edi
242 push esi
243 mov ebx, [esp + 12 + 4 + 0]
244 mov esi, [esp + 12 + 4 + 4]
245 %endmacro
246 %macro EPILOGUE_4_ARGS_EX 1
247 %if (%1) < 8
248 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
249 %endif
250 pop esi
251 pop edi
252 pop ebx
253 ret %1
254 %endmacro
255 %macro EPILOGUE_4_ARGS 0
256 EPILOGUE_4_ARGS_EX 8
257 %endmacro
258
259 %define A0 ecx
260 %define A0_32 ecx
261 %define A0_16 cx
262 %define A0_8 cl
263
264 %define A1 edx
265 %define A1_32 edx
266 %define A1_16 dx
267 %define A1_8 dl
268
269 %define A2 ebx
270 %define A2_32 ebx
271 %define A2_16 bx
272 %define A2_8 bl
273
274 %define A3 esi
275 %define A3_32 esi
276 %define A3_16 si
277
278 %define T0 eax
279 %define T0_32 eax
280 %define T0_16 ax
281 %define T0_8 al
282
283 %define T1 edi
284 %define T1_32 edi
285 %define T1_16 di
286%endif
287
288
289;;
290; Load the relevant flags from [%1] if there are undefined flags (%3).
291;
292; @remarks Clobbers T0, stack. Changes EFLAGS.
293; @param 1 The parameter (A0..A3) holding the eflags value.
294; @param 2 The set of modified flags.
295; @param 3 The set of undefined flags.
296; @param 4 The flags that must be loaded.
297;
298%macro IEM_MAYBE_LOAD_FLAGS 4
299 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
300 pushf ; store current flags
301 mov T0_32, %1 ; load the guest flags
302 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified and undefined flags
303 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified and undefined flags.
304 or [xSP], T0 ; merge guest flags with host flags.
305 popf ; load the mixed flags.
306
307 %elif (%3 + %4) != 0
308 %if 1 ; This approach seems faster on intel 10980XE
309 %if (%3 | %4) == X86_EFL_CF
310 ; Use bt to load bit into CF
311 bt %1, X86_EFL_CF_BIT
312 %else
313 ; Use ADD to set OF and SHAF for the rest. ASSUMES T0_32 is eax!
314 mov eax, %1
315 %if (%3 | %4) == X86_EFL_OF
316 ; Use ADD to set OF.
317 shl eax, 31 - X86_EFL_OF_BIT
318 add eax, 80000000h
319 %elif ((%3 | %4) & X86_EFL_OF) != 0
320 ; Use ADD to set OF.
321 xchg al, ah
322 shl al, 15 - X86_EFL_OF_BIT
323 add al, 80h
324 ; Use SAHF to set the other status flags.
325 sahf
326 %else ; OF not needed; so al -> ah and load ah into eflags.
327 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
328 shl eax, 8
329 %else
330 xchg al, ah
331 %endif
332 sahf
333 %endif
334 %endif
335
336 %else
337 pushf ; store current flags
338 mov T0_32, %1 ; load the guest flags
339 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
340 and T0_32, (%2 | %3) ; select the modified and undefined flags.
341 or [xSP], T0 ; merge guest flags with host flags.
342 popf ; load the mixed flags.
343 %endif
344 %endif
345%endmacro
346
347;;
348; Load the relevant flags from [%1].
349;
350; @remarks Clobbers T0, stack. Changes EFLAGS.
351; @param 1 The parameter (A0..A3) holding the eflags value.
352; @param 2 The set of flags to load.
353; @param 3 The set of undefined flags.
354;
355%macro IEM_LOAD_FLAGS 3
356 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
357 pushf ; store current flags
358 mov T0_32, %1 ; load the guest flags
359 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified, undefined and status flags
360 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
361 or [xSP], T0 ; merge guest flags with host flags.
362 popf ; load the mixed flags.
363
364 %elif 1 ; This approach seems faster on intel 10980XE
365 %if (%3 | %2) == X86_EFL_CF
366 ; Use bt to load bit into CF
367 bt %1, X86_EFL_CF_BIT
368 %else
369 mov eax, %1 ; ASSUMES T0_32 is eax!!
370 %if (%3 | %2) == X86_EFL_OF
371 ; Use ADD to set OF.
372 shl eax, 31 - X86_EFL_OF_BIT
373 add eax, 80000000h
374 %elif ((%3 | %2) & X86_EFL_OF) != 0
375 ; Use ADD to set OF.
376 xchg al, ah
377 shl al, 15 - X86_EFL_OF_BIT
378 add al, 80h
379 ; Use SAHF to set the other status flags.
380 sahf
381 %else ; OF not needed; so al -> ah and load ah into eflags.
382 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
383 shl eax, 8
384 %else
385 xchg al, ah
386 %endif
387 sahf
388 %endif
389 %endif ; (%3 | %2) != X86_EFL_CF
390
391 %else
392 pushf ; store current flags
393 mov T0_32, %1 ; load the guest flags
394 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
395 and T0_32, (%2 | %3) ; select the modified and undefined flags.
396 or [xSP], T0 ; merge guest flags with host flags.
397 popf ; load the mixed flags.
398 %endif
399%endmacro
400
401;;
402; Merge incoming guest EFLAGS (%1) with host EFLAGS into EAX (T0).
403;
404; @remarks Clobbers T0, T1, %1, stack.
405; @param 1 The parameter (A0..A3) holding the OLD eflags value. Clobbered.
406; @param 2 The mask of modified flags to save.
407; @param 3 The mask of undefined flags to (maybe) save.
408; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
409;
410%macro IEM_SAVE_FLAGS_RETVAL 4 0
411 %if (%2 | %3 | %4) != 0
412 mov T1_32, %1 ; flags
413 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
414 pushf
415 pop T0
416 and %1, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
417 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
418 %else
419 %if (%2 | %3 | %4) == X86_EFL_CF
420 setc T0_8
421 %elif (%2 | %3) == X86_EFL_OF
422 seto T0_8
423 shl T0_32, X86_EFL_OF_BIT
424 %elif (%2 | %3) == X86_EFL_ZF
425 setz T0_8 ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
426 shl T0_32, X86_EFL_ZF_BIT
427 %elif (%2 | %3) <= 0xff
428 lahf
429 movzx eax, ah ; ASSUMES T0_32 is eax!
430 %elif 1 ; The locked functions are generally faster on 10980XE with this approach
431 lahf ; while there seems only to be a tiny advantage in most other test.
432 movzx eax, ah ; ASSUMES T0_32 is eax!
433 jno .of_is_clear
434 or eax, X86_EFL_OF
435.of_is_clear:
436 %else
437 pushf ; this is a bit slow
438 pop T0
439 %endif
440 and %1, ~(%2 | %3 | %4) ; clear the modified & undefined & zeroed flags.
441 and T0_32, (%2 | %3) ; select the modified and undefined flags.
442 %endif
443 or T0_32, %1 ; combine the flags. ASSUMES T0 = eax!
444 ;mov %1, T0_32 ; save the flags.
445 %endif
446%endmacro
447
448;;
449; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
450;
451; @remarks Clobbers T0, T1, stack.
452; @param 1 The parameter (A0..A3) holding the eflags value.
453; @param 2 The mask of modified flags to save.
454; @param 3 Mask of additional flags to always clear
455; @param 4 Mask of additional flags to always set.
456;
457;; @todo make it stuff the result into EAX?
458%macro IEM_SAVE_AND_ADJUST_FLAGS 4
459 %if (%2 | %3 | %4) != 0
460 pushf
461 pop T1
462 mov T0_32, %1 ; load flags.
463 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
464 and T1_32, (%2) ; select the modified flags.
465 or T0_32, T1_32 ; combine the flags.
466 %if (%4) != 0
467 or T0_32, %4 ; add the always set flags.
468 %endif
469 mov %1, T0_32 ; save the result.
470 %endif
471%endmacro
472
473;;
474; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
475; signed input (%4[%5]) and parity index (%6), storing the result into EAX (T0).
476;
477; @note %4 & %6 must not be RAX, EAX, or AX! So, don't use with full MUL/IMUL.
478
479; @remarks Clobbers T0, T1, stack, %6, EFLAGS, %1.
480; @param 1 The parameter (A0..A3) holding the eflags value.
481; @param 2 The mask of modified flags to save.
482; @param 3 Mask of additional flags to always clear
483; @param 4 The result register to set SF by.
484; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
485; @param 6 The (full) register containing the parity table index. Will be modified!
486%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL 6
487 pushf
488 pop T0
489 and %1, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
490 and T0_32, (%2) ; select the modified flags.
491 or T0_32, %1 ; combine the flags.
492
493 ; First calculate SF as it is the same register as %6 (only %6 is always full width).
494 bt %4, %5 - 1
495 jnc %%sf_clear
496 or T0_32, X86_EFL_SF
497 %%sf_clear:
498
499 ; Parity last.
500 and %6, 0xff
501 %ifdef RT_ARCH_AMD64
502 lea T1, [NAME(g_afParity) xWrtRIP]
503 or T0_8, [T1 + %6]
504 %else
505 or T0_8, [NAME(g_afParity) + %6]
506 %endif
507
508 ;mov %1, T0_32 ; save the result.
509 ; ASSUMES T0 = eax!
510%endmacro
511
512;;
513; Calculates the new EFLAGS using fixed clear and set bit masks.
514;
515; @remarks Clobbers T0.
516; @param 1 The parameter (A0..A3) holding the eflags value.
517; @param 2 Mask of additional flags to always clear
518; @param 3 Mask of additional flags to always set.
519;
520%macro IEM_ADJUST_FLAGS 3
521 %if (%2 | %3) != 0
522 mov T0_32, %1 ; Load flags.
523 %if (%2) != 0
524 and T0_32, ~(%2) ; Remove the always cleared flags.
525 %endif
526 %if (%3) != 0
527 or T0_32, %3 ; Add the always set flags.
528 %endif
529 mov %1, T0_32 ; Save the result.
530 %endif
531%endmacro
532
533;;
534; Calculates the new EFLAGS using fixed clear and set bit masks.
535;
536; @remarks Clobbers T0, %4, EFLAGS.
537; @param 1 The parameter (A0..A3) holding the eflags value.
538; @param 2 Mask of additional flags to always clear
539; @param 3 Mask of additional flags to always set.
540; @param 4 The (full) register containing the parity table index. Will be modified!
541;
542%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
543 mov T0_32, %1 ; Load flags.
544 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
545 %if (%3) != 0
546 or T0_32, %3 ; Add the always set flags.
547 %endif
548 and %4, 0xff
549 %ifdef RT_ARCH_AMD64
550 lea T2, [NAME(g_afParity) xWrtRIP]
551 or T0_8, [T2 + %4]
552 %else
553 or T0_8, [NAME(g_afParity) + %4]
554 %endif
555 mov %1, T0_32 ; Save the result.
556%endmacro
557
558
559;;;; OLD EFLAGS macros.
560;;;; OLD EFLAGS macros.
561;;;; OLD EFLAGS macros.
562;;;; OLD EFLAGS macros.
563;;;; OLD EFLAGS macros.
564
565;;
566; Load the relevant flags from [%1] if there are undefined flags (%3).
567;
568; @remarks Clobbers T0, stack. Changes EFLAGS.
569; @param 1 The parameter (A0..A3) pointing to the eflags.
570; @param 2 The set of modified flags.
571; @param 3 The set of undefined flags.
572; @param 4 The flags that must be loaded.
573;
574%macro IEM_MAYBE_LOAD_FLAGS_OLD 4
575 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
576 pushf ; store current flags
577 mov T0_32, [%1] ; load the guest flags
578 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified and undefined flags
579 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified and undefined flags.
580 or [xSP], T0 ; merge guest flags with host flags.
581 popf ; load the mixed flags.
582
583 %elif (%3 + %4) != 0
584 %if 1 ; This approach seems faster on intel 10980XE
585 %if (%3 | %4) == X86_EFL_CF
586 ; Use bt to load bit into CF
587 bt dword [%1], X86_EFL_CF_BIT
588 %else
589 ; Use ADD to set OF and SHAF for the rest. ASSUMES T0_32 is eax!
590 mov eax, [%1]
591 %if (%3 | %4) == X86_EFL_OF
592 ; Use ADD to set OF.
593 shl eax, 31 - X86_EFL_OF_BIT
594 add eax, 80000000h
595 %elif ((%3 | %4) & X86_EFL_OF) != 0
596 ; Use ADD to set OF.
597 xchg al, ah
598 shl al, 15 - X86_EFL_OF_BIT
599 add al, 80h
600 ; Use SAHF to set the other status flags.
601 sahf
602 %else ; OF not needed; so al -> ah and load ah into eflags.
603 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
604 shl eax, 8
605 %else
606 xchg al, ah
607 %endif
608 sahf
609 %endif
610 %endif
611
612 %else
613 pushf ; store current flags
614 mov T0_32, [%1] ; load the guest flags
615 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
616 and T0_32, (%2 | %3) ; select the modified and undefined flags.
617 or [xSP], T0 ; merge guest flags with host flags.
618 popf ; load the mixed flags.
619 %endif
620 %endif
621%endmacro
622
623;;
624; Load the relevant flags from [%1].
625;
626; @remarks Clobbers T0, stack. Changes EFLAGS.
627; @param 1 The parameter (A0..A3) pointing to the eflags.
628; @param 2 The set of flags to load.
629; @param 3 The set of undefined flags.
630;
631%macro IEM_LOAD_FLAGS_OLD 3
632 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
633 pushf ; store current flags
634 mov T0_32, [%1] ; load the guest flags
635 and dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified, undefined and status flags
636 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
637 or [xSP], T0 ; merge guest flags with host flags.
638 popf ; load the mixed flags.
639
640 %elif 1 ; This approach seems faster on intel 10980XE
641 %if (%3 | %2) == X86_EFL_CF
642 ; Use bt to load bit into CF
643 bt dword [%1], X86_EFL_CF_BIT
644 %else
645 mov eax, [%1] ; ASSUMES T0_32 is eax!!
646 %if (%3 | %2) == X86_EFL_OF
647 ; Use ADD to set OF.
648 shl eax, 31 - X86_EFL_OF_BIT
649 add eax, 80000000h
650 %elif ((%3 | %2) & X86_EFL_OF) != 0
651 ; Use ADD to set OF.
652 xchg al, ah
653 shl al, 15 - X86_EFL_OF_BIT
654 add al, 80h
655 ; Use SAHF to set the other status flags.
656 sahf
657 %else ; OF not needed; so al -> ah and load ah into eflags.
658 %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
659 shl eax, 8
660 %else
661 xchg al, ah
662 %endif
663 sahf
664 %endif
665 %endif ; (%3 | %2) != X86_EFL_CF
666
667 %else
668 pushf ; store current flags
669 mov T0_32, [%1] ; load the guest flags
670 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
671 and T0_32, (%2 | %3) ; select the modified and undefined flags.
672 or [xSP], T0 ; merge guest flags with host flags.
673 popf ; load the mixed flags.
674 %endif
675%endmacro
676
677;;
678; Update the flag.
679;
680; @remarks Clobbers T0, T1, stack.
681; @param 1 The register pointing to the EFLAGS.
682; @param 2 The mask of modified flags to save.
683; @param 3 The mask of undefined flags to (maybe) save.
684; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
685;
686%macro IEM_SAVE_FLAGS_OLD 4 0
687 %if (%2 | %3 | %4) != 0
688 mov T1_32, [%1] ; flags
689 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
690 pushf
691 pop T0
692 and T1_32, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
693 and T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
694 %else
695 %if (%2 | %3 | %4) == X86_EFL_CF
696 setc T0_8
697 %elif (%2 | %3) == X86_EFL_OF
698 seto T0_8
699 shl T0_32, X86_EFL_OF_BIT
700 %elif (%2 | %3) == X86_EFL_ZF
701 setz T0_8 ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
702 shl T0_32, X86_EFL_ZF_BIT
703 %elif (%2 | %3) <= 0xff
704 lahf
705 movzx eax, ah ; ASSUMES T0_32 is eax!
706 %elif 1 ; The locked functions are generally faster on 10980XE with this approach
707 lahf ; while there seems only to be a tiny advantage in most other test.
708 movzx eax, ah ; ASSUMES T0_32 is eax!
709 jno .of_is_clear
710 or eax, X86_EFL_OF
711.of_is_clear:
712 %else
713 pushf ; this is a bit slow
714 pop T0
715 %endif
716 and T1_32, ~(%2 | %3 | %4) ; clear the modified & undefined & zeroed flags.
717 and T0_32, (%2 | %3) ; select the modified and undefined flags.
718 %endif
719 or T0_32, T1_32 ; combine the flags.
720 mov [%1], T0_32 ; save the flags.
721 %endif
722%endmacro
723
724;;
725; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
726;
727; @remarks Clobbers T0, T1, stack.
728; @param 1 The register pointing to the EFLAGS.
729; @param 2 The mask of modified flags to save.
730; @param 3 Mask of additional flags to always clear
731; @param 4 Mask of additional flags to always set.
732;
733%macro IEM_SAVE_AND_ADJUST_FLAGS_OLD 4
734 %if (%2 | %3 | %4) != 0
735 pushf
736 pop T1
737 mov T0_32, [%1] ; load flags.
738 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
739 and T1_32, (%2) ; select the modified flags.
740 or T0_32, T1_32 ; combine the flags.
741 %if (%4) != 0
742 or T0_32, %4 ; add the always set flags.
743 %endif
744 mov [%1], T0_32 ; save the result.
745 %endif
746%endmacro
747
748;;
749; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
750; signed input (%4[%5]) and parity index (%6).
751;
752; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
753; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
754; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
755;
756; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
757; @param 1 The register pointing to the EFLAGS.
758; @param 2 The mask of modified flags to save.
759; @param 3 Mask of additional flags to always clear
760; @param 4 The result register to set SF by.
761; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
762; @param 6 The (full) register containing the parity table index. Will be modified!
763
764%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD 6
765 %ifdef RT_ARCH_AMD64
766 pushf
767 pop T2
768 %else
769 push T0
770 pushf
771 pop T0
772 %endif
773 mov T1_32, [%1] ; load flags.
774 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
775 %ifdef RT_ARCH_AMD64
776 and T2_32, (%2) ; select the modified flags.
777 or T1_32, T2_32 ; combine the flags.
778 %else
779 and T0_32, (%2) ; select the modified flags.
780 or T1_32, T0_32 ; combine the flags.
781 pop T0
782 %endif
783
784 ; First calculate SF as it's likely to be refereing to the same register as %6 does.
785 bt %4, %5 - 1
786 jnc %%sf_clear
787 or T1_32, X86_EFL_SF
788 %%sf_clear:
789
790 ; Parity last.
791 and %6, 0xff
792 %ifdef RT_ARCH_AMD64
793 lea T2, [NAME(g_afParity) xWrtRIP]
794 or T1_8, [T2 + %6]
795 %else
796 or T1_8, [NAME(g_afParity) + %6]
797 %endif
798
799 mov [%1], T1_32 ; save the result.
800%endmacro
801
802;;
803; Calculates the new EFLAGS using fixed clear and set bit masks.
804;
805; @remarks Clobbers T0.
806; @param 1 The register pointing to the EFLAGS.
807; @param 2 Mask of additional flags to always clear
808; @param 3 Mask of additional flags to always set.
809;
810%macro IEM_ADJUST_FLAGS_OLD 3
811 %if (%2 | %3) != 0
812 mov T0_32, [%1] ; Load flags.
813 %if (%2) != 0
814 and T0_32, ~(%2) ; Remove the always cleared flags.
815 %endif
816 %if (%3) != 0
817 or T0_32, %3 ; Add the always set flags.
818 %endif
819 mov [%1], T0_32 ; Save the result.
820 %endif
821%endmacro
822
823;;
824; Calculates the new EFLAGS using fixed clear and set bit masks.
825;
826; @remarks Clobbers T0, %4, EFLAGS.
827; @param 1 The register pointing to the EFLAGS.
828; @param 2 Mask of additional flags to always clear
829; @param 3 Mask of additional flags to always set.
830; @param 4 The (full) register containing the parity table index. Will be modified!
831;
832%macro IEM_ADJUST_FLAGS_WITH_PARITY_OLD 4
833 mov T0_32, [%1] ; Load flags.
834 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
835 %if (%3) != 0
836 or T0_32, %3 ; Add the always set flags.
837 %endif
838 and %4, 0xff
839 %ifdef RT_ARCH_AMD64
840 lea T2, [NAME(g_afParity) xWrtRIP]
841 or T0_8, [T2 + %4]
842 %else
843 or T0_8, [NAME(g_afParity) + %4]
844 %endif
845 mov [%1], T0_32 ; Save the result.
846%endmacro
847
848
849
850;;
851; Loads register with offset of imm8 instruction -- used by all of the instruction
852; implementations which lay out jump tables of 256x immediate byte variants.
853; Also checks that the instruction size matches the offsets in the table.
854;
855; @param 1 The register to receive the jump target address (T1).
856; @param 2 The register containing the imm8 index (A1 / A2 / A3).
857; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
858; @note Implicitly uses local symbols .imm0, .imm1, and .immEmd
859; (implementation artifacts of each instruction jump table).
860;
861; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]`.
862;
863%macro IEMIMPL_JUMP_TABLE_TARGET_INT 3
864 lea %1, [.imm0 xWrtRIP]
865 %if %3 == 5
866 lea T0, [%2 + %2*4] ; *5
867 lea %1, [%1 + T0] ; *5 + .imm0
868 %elif %3 == 6
869 lea T0, [%2 + %2*2] ; *3
870 lea %1, [%1 + T0*2] ; *6 + .imm0
871 %elif %3 == 7
872 lea T0, [%2 + %2*2] ; *3
873 lea T0, [T0 + %2*4] ; *7
874 lea %1, [%1 + T0] ; *7 + .imm0
875 %elif %3 == 8
876 lea %1, [%1 + %2*8] ; *8 + .imm0
877 %elif %3 == 9
878 lea T0, [%2 + %2*8] ; *9
879 lea %1, [%1 + T0] ; *9 + .imm0
880 %elif %3 == 10
881 lea T0, [%2 + %2*4] ; *5
882 lea %1, [%1 + T0*2] ; *10 + .imm0
883 %elif %3 == 11
884 lea T0, [%2 + %2*4] ; *5
885 lea T0, [%2 + T0*2] ; *11
886 lea %1, [%1 + T0] ; *11 + .imm0
887 %elif %3 == 12
888 lea T0, [%2 + %2*2] ; *3
889 lea %1, [%1 + T0*4] ; *12 + .imm0
890 %else
891 %error Unexpected instruction byte count in IEMIMPL_JUMP_TABLE_TARGET_INT
892 %endif
893 ; check size: 'warning: value does not fit in 8 bit field' if bad
894 times (.imm1 - .imm0 + %3) %% %3 db 999 * \
895 (.imm1 - .imm0 + %3)
896 ; check alignment: 'warning: value does not fit in 8 bit field' if bad
897 times ((.immEnd - .imm0) - 256 * %3) db 999 * \
898 ((.immEnd - .imm0) - 256 * %3)
899%endmacro
900
901%macro IEMIMPL_JUMP_TABLE_TARGET 3
902 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
903 IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, (%3 + 4)
904 %else
905 IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, %3
906 %endif
907%endmacro
908
909
910;;
911; Calls the given imm8 instruction -- used by all of the instruction
912; implementations which lay out jump tables of 256x immediate byte variants.
913;
914; @param 1 The register to receive the jump target address (T1).
915; @param 2 The register containing the imm8 index (A1 / A2 / A3).
916; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
917;
918; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]` +
919; `IBT_NOTRACK, call %1`.
920;
921%macro IEMIMPL_CALL_JUMP_TABLE_TARGET 3
922 IEMIMPL_JUMP_TABLE_TARGET %1, %2, %3
923 IBT_NOTRACK
924 call %1
925%endmacro
926
927
928;*********************************************************************************************************************************
929;* External Symbols *
930;*********************************************************************************************************************************
931extern NAME(g_afParity)
932
933
934;;
935; Macro for implementing a binary operator.
936;
937; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
938; variants, except on 32-bit system where the 64-bit accesses requires hand
939; coding.
940;
941; All the functions takes a pointer to the destination memory operand in A0,
942; the source register operand in A1 and a pointer to eflags in A2.
943;
944; @param 1 The instruction mnemonic.
945; @param 2 Non-zero if there should be a locked version.
946; @param 3 The modified flags.
947; @param 4 The undefined flags.
948; @param 5 The flags that must be loaded (ADC, SBC).
949; @param 6 The flags that will be zeroed by the operation.
950;
951%macro IEMIMPL_BIN_OP 6
952BEGINCODE
953BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
954 PROLOGUE_3_ARGS
955 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
956 %1 byte [A1], A2_8
957 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
958 EPILOGUE_3_ARGS
959ENDPROC iemAImpl_ %+ %1 %+ _u8
960
961BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
962 PROLOGUE_3_ARGS
963 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
964 %1 word [A1], A2_16
965 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
966 EPILOGUE_3_ARGS
967ENDPROC iemAImpl_ %+ %1 %+ _u16
968
969BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
970 PROLOGUE_3_ARGS
971 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
972 %1 dword [A1], A2_32
973 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
974 EPILOGUE_3_ARGS
975ENDPROC iemAImpl_ %+ %1 %+ _u32
976
977 %ifdef RT_ARCH_AMD64
978BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
979 PROLOGUE_3_ARGS
980 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
981 %1 qword [A1], A2
982 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
983 EPILOGUE_3_ARGS_EX 8
984ENDPROC iemAImpl_ %+ %1 %+ _u64
985 %endif ; RT_ARCH_AMD64
986
987 %if %2 != 0 ; locked versions requested?
988
989BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
990 PROLOGUE_3_ARGS
991 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
992 lock %1 byte [A1], A2_8
993 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
994 EPILOGUE_3_ARGS
995ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
996
997BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
998 PROLOGUE_3_ARGS
999 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
1000 lock %1 word [A1], A2_16
1001 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
1002 EPILOGUE_3_ARGS
1003ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1004
1005BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
1006 PROLOGUE_3_ARGS
1007 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
1008 lock %1 dword [A1], A2_32
1009 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
1010 EPILOGUE_3_ARGS
1011ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1012
1013 %ifdef RT_ARCH_AMD64
1014BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
1015 PROLOGUE_3_ARGS
1016 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, %5
1017 lock %1 qword [A1], A2
1018 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, %6
1019 EPILOGUE_3_ARGS_EX 8
1020ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1021 %endif ; RT_ARCH_AMD64
1022 %endif ; locked
1023%endmacro
1024
; Instantiations of the standard ALU binary operations.
; Columns: instr, lock, modified-flags, undefined flags, must be loaded, zeroed flags
;
; Note: ADC and SBB list X86_EFL_CF in the "must be loaded" column since the
;       incoming carry is an input to the instruction.  The logical ops
;       (OR/XOR/AND/TEST) clear OF and CF and leave AF undefined per the SDM,
;       hence the zeroed/undefined columns for those.
IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0
IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0
IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
IEMIMPL_BIN_OP or, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP xor, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP and, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP test, 0, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1035
1036
1037;;
1038; Macro for implementing a binary operator, VEX variant with separate input/output.
1039;
1040; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1041; where the 64-bit accesses requires hand coding.
1042;
1043; All the functions takes a pointer to the destination memory operand in A0,
1044; the first source register operand in A1, the second source register operand
1045; in A2 and a pointer to eflags in A3.
1046;
1047; @param 1 The instruction mnemonic.
1048; @param 2 The modified flags.
1049; @param 3 The undefined flags.
1050; @param 4 The zeroed flags.
1051;
1052%macro IEMIMPL_VEX_BIN_OP 4
1053BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1054 PROLOGUE_4_ARGS
1055 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0 ;; @todo do we need to load undefined flags for any platform?
1056 %1 T0_32, A1_32, A2_32
1057 mov [A0], T0_32
1058 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
1059 EPILOGUE_4_ARGS
1060ENDPROC iemAImpl_ %+ %1 %+ _u32
1061
1062 %ifdef RT_ARCH_AMD64
1063BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1064 PROLOGUE_4_ARGS
1065 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0
1066 %1 T0, A1, A2
1067 mov [A0], T0
1068 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
1069 EPILOGUE_4_ARGS
1070ENDPROC iemAImpl_ %+ %1 %+ _u64
1071 %endif ; RT_ARCH_AMD64
1072%endmacro
1073
1074; instr, modified-flags, undefined-flags, zeroed-flags
1075IEMIMPL_VEX_BIN_OP andn, X86_EFL_SF | X86_EFL_ZF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
1076IEMIMPL_VEX_BIN_OP bextr, X86_EFL_ZF, X86_EFL_SF | X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
1077IEMIMPL_VEX_BIN_OP bzhi, X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF
1078
1079;;
1080; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
1081;
1082; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1083; where the 64-bit accesses requires hand coding.
1084;
1085; All the functions takes a pointer to the destination memory operand in A0,
1086; the source register operand in A1 and a pointer to eflags in A2.
1087;
1088; @param 1 The instruction mnemonic.
1089; @param 2 The modified flags.
1090; @param 3 The undefined flags.
1091; @param 4 The zeroed flags.
1092;
1093%macro IEMIMPL_VEX_BIN_OP_2 4
1094BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1095 PROLOGUE_4_ARGS
1096 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, 0 ;; @todo check if any undefined flags are passed thru
1097 mov T0_32, [A0]
1098 %1 T0_32, A1_32
1099 mov [A0], T0_32
1100 IEM_SAVE_FLAGS_OLD A2, %2, %3, %4
1101 EPILOGUE_4_ARGS
1102ENDPROC iemAImpl_ %+ %1 %+ _u32
1103
1104 %ifdef RT_ARCH_AMD64
1105BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1106 PROLOGUE_4_ARGS
1107 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, 0
1108 mov T0, [A0]
1109 %1 T0, A1
1110 mov [A0], T0
1111 IEM_SAVE_FLAGS_OLD A2, %2, %3, %4
1112 EPILOGUE_4_ARGS
1113ENDPROC iemAImpl_ %+ %1 %+ _u64
1114 %endif ; RT_ARCH_AMD64
1115%endmacro
1116
1117; instr, modified-flags, undefined-flags zeroed-flags
1118IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1119IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1120IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1121
1122
1123;;
1124; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
1125;
1126; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1127; where the 64-bit accesses requires hand coding.
1128;
1129; All the functions takes a pointer to the destination memory operand in A0,
1130; the first source register operand in A1, the second source register operand
1131; in A2 and a pointer to eflags in A3.
1132;
1133; @param 1 The instruction mnemonic.
1134; @param 2 Fallback instruction if applicable.
1135; @param 3 Whether to emit fallback or not.
1136;
1137%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
1138BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1139 PROLOGUE_3_ARGS
1140 %1 T0_32, A1_32, A2_32
1141 mov [A0], T0_32
1142 EPILOGUE_3_ARGS
1143ENDPROC iemAImpl_ %+ %1 %+ _u32
1144
1145 %if %3
1146BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
1147 PROLOGUE_3_ARGS
1148 %ifdef ASM_CALL64_GCC
1149 mov cl, A2_8
1150 %2 A1_32, cl
1151 mov [A0], A1_32
1152 %else
1153 xchg A2, A0
1154 %2 A1_32, cl
1155 mov [A2], A1_32
1156 %endif
1157 EPILOGUE_3_ARGS
1158ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
1159 %endif
1160
1161 %ifdef RT_ARCH_AMD64
1162BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1163 PROLOGUE_3_ARGS
1164 %1 T0, A1, A2
1165 mov [A0], T0
1166 EPILOGUE_3_ARGS
1167ENDPROC iemAImpl_ %+ %1 %+ _u64
1168
1169 %if %3
1170BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
1171 PROLOGUE_3_ARGS
1172 %ifdef ASM_CALL64_GCC
1173 mov cl, A2_8
1174 %2 A1, cl
1175 mov [A0], A1_32
1176 %else
1177 xchg A2, A0
1178 %2 A1, cl
1179 mov [A2], A1_32
1180 %endif
1181 mov [A0], A1
1182 EPILOGUE_3_ARGS
1183ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
1184 %endif
1185 %endif ; RT_ARCH_AMD64
1186%endmacro
1187
1188; instr, fallback instr, emit fallback
1189IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
1190IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
1191IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
1192IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
1193IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
1194
1195
1196;
1197; RORX uses a immediate byte for the shift count, so we only do
1198; fallback implementation of that one.
1199;
1200BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
1201 PROLOGUE_3_ARGS
1202 %ifdef ASM_CALL64_GCC
1203 mov cl, A2_8
1204 ror A1_32, cl
1205 mov [A0], A1_32
1206 %else
1207 xchg A2, A0
1208 ror A1_32, cl
1209 mov [A2], A1_32
1210 %endif
1211 EPILOGUE_3_ARGS
1212ENDPROC iemAImpl_rorx_u32
1213
1214 %ifdef RT_ARCH_AMD64
1215BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
1216 PROLOGUE_3_ARGS
1217 %ifdef ASM_CALL64_GCC
1218 mov cl, A2_8
1219 ror A1, cl
1220 mov [A0], A1
1221 %else
1222 xchg A2, A0
1223 ror A1, cl
1224 mov [A2], A1
1225 %endif
1226 EPILOGUE_3_ARGS
1227ENDPROC iemAImpl_rorx_u64
1228 %endif ; RT_ARCH_AMD64
1229
1230
1231;
1232; MULX
1233;
1234BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
1235 PROLOGUE_4_ARGS
1236%ifdef ASM_CALL64_GCC
1237 ; A2_32 is EDX - prefect
1238 mulx T0_32, T1_32, A3_32
1239 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
1240 mov [A0], T0_32
1241%else
1242 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
1243 xchg A1, A2
1244 mulx T0_32, T1_32, A3_32
1245 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
1246 mov [A0], T0_32
1247%endif
1248 EPILOGUE_4_ARGS
1249ENDPROC iemAImpl_mulx_u32
1250
1251
; Fallback for hosts without BMI2: use plain MUL (EDX:EAX = EAX * src).
; Host EFLAGS clobbered by MUL are irrelevant - MULX has no flag outputs.
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
    PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
    EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
1270
%ifdef RT_ARCH_AMD64
; 64-bit MULX: RDX is the implicit first source; no EFLAGS outputs.
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
    PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1 ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1 ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
    EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


; Fallback for hosts without BMI2: plain MUL (RDX:RAX = RAX * src).
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
    PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
    EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
1310
1311
1312;;
1313; Macro for implementing a bit operator.
1314;
1315; This will generate code for the 16, 32 and 64 bit accesses with locked
1316; variants, except on 32-bit system where the 64-bit accesses requires hand
1317; coding.
1318;
1319; All the functions takes a pointer to the destination memory operand in A0,
1320; the source register operand in A1 and a pointer to eflags in A2.
1321;
1322; @param 1 The instruction mnemonic.
1323; @param 2 Non-zero if there should be a locked version.
1324; @param 3 The modified flags.
1325; @param 4 The undefined flags.
1326;
1327%macro IEMIMPL_BIT_OP 4
1328BEGINCODE
1329BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1330 PROLOGUE_3_ARGS
1331 IEM_MAYBE_LOAD_FLAGS_OLD A2, %3, %4, 0
1332 %1 word [A0], A1_16
1333 IEM_SAVE_FLAGS_OLD A2, %3, %4, 0
1334 EPILOGUE_3_ARGS
1335ENDPROC iemAImpl_ %+ %1 %+ _u16
1336
1337BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1338 PROLOGUE_3_ARGS
1339 IEM_MAYBE_LOAD_FLAGS_OLD A2, %3, %4, 0
1340 %1 dword [A0], A1_32
1341 IEM_SAVE_FLAGS_OLD A2, %3, %4, 0
1342 EPILOGUE_3_ARGS
1343ENDPROC iemAImpl_ %+ %1 %+ _u32
1344
1345 %ifdef RT_ARCH_AMD64
1346BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1347 PROLOGUE_3_ARGS
1348 IEM_MAYBE_LOAD_FLAGS_OLD A2, %3, %4, 0
1349 %1 qword [A0], A1
1350 IEM_SAVE_FLAGS_OLD A2, %3, %4, 0
1351 EPILOGUE_3_ARGS_EX 8
1352ENDPROC iemAImpl_ %+ %1 %+ _u64
1353 %endif ; RT_ARCH_AMD64
1354
1355 %if %2 != 0 ; locked versions requested?
1356
1357BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
1358 PROLOGUE_3_ARGS
1359 IEM_MAYBE_LOAD_FLAGS_OLD A2, %3, %4, 0
1360 lock %1 word [A0], A1_16
1361 IEM_SAVE_FLAGS_OLD A2, %3, %4, 0
1362 EPILOGUE_3_ARGS
1363ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1364
1365BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
1366 PROLOGUE_3_ARGS
1367 IEM_MAYBE_LOAD_FLAGS_OLD A2, %3, %4, 0
1368 lock %1 dword [A0], A1_32
1369 IEM_SAVE_FLAGS_OLD A2, %3, %4, 0
1370 EPILOGUE_3_ARGS
1371ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1372
1373 %ifdef RT_ARCH_AMD64
1374BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
1375 PROLOGUE_3_ARGS
1376 IEM_MAYBE_LOAD_FLAGS_OLD A2, %3, %4, 0
1377 lock %1 qword [A0], A1
1378 IEM_SAVE_FLAGS_OLD A2, %3, %4, 0
1379 EPILOGUE_3_ARGS_EX 8
1380ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1381 %endif ; RT_ARCH_AMD64
1382 %endif ; locked
1383%endmacro
1384
1385; Undefined flags are passed thru here by the intel and amd CPUs we have.
1386; modified efl, undefined eflags
1387IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1388IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1389IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1390IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1391
1392;;
1393; Macro for implementing a bit search operator.
1394;
1395; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1396; system where the 64-bit accesses requires hand coding.
1397;
1398; All the functions takes a pointer to the destination memory operand in A1,
1399; the source register operand in A2 and the incoming eflags in A0.
1400;
1401; In the ZF case the destination register is 'undefined', however it seems that
1402; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
1403; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
1404; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
1405; the behaviour of more recent CPUs (Intel 10980XE and AMD 3990X).
1406;
1407; Intel: Clear all and calculate PF in addition to ZF.
1408; AMD: Passthru all flags other than ZF.
1409;
1410; @param 1 The instruction mnemonic.
1411; @param 2 The modified flags.
1412; @param 3 The undefined flags.
1413; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
1414;
1415%macro IEMIMPL_BIT_OP2 4
1416BEGINCODE
1417; 16-bit
1418
1419BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1420 PROLOGUE_3_ARGS
1421 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1422 %1 T0_16, A2_16
1423%if %4 != 0
1424 jz .unchanged_dst
1425%endif
1426 mov [A1], T0_16
1427.unchanged_dst:
1428 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1429 EPILOGUE_3_ARGS
1430ENDPROC iemAImpl_ %+ %1 %+ _u16
1431
1432;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
1433;bad; PROLOGUE_3_ARGS
1434;bad; %1 T1_16, A1_16
1435;bad; jz .unchanged_dst
1436;bad; mov [A0], T1_16
1437;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1438;bad; EPILOGUE_3_ARGS
1439;bad;.unchanged_dst:
1440;bad;%if %4 != 0
1441;bad; mov [A0], T1_16
1442;bad;%endif
1443;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1444;bad; EPILOGUE_3_ARGS
1445;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
1446;bad;
1447;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
1448;bad; PROLOGUE_3_ARGS
1449;bad; %1 T0_16, A1_16
1450;bad;%if %4 != 0
1451;bad; jz .unchanged_dst
1452;bad;%endif
1453;bad; mov [A0], T0_16
1454;bad;.unchanged_dst:
1455;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1456;bad; EPILOGUE_3_ARGS
1457;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1458
1459; 32-bit
1460
1461BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1462 PROLOGUE_3_ARGS
1463 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1464 %1 T0_32, A2_32
1465%if %4 != 0
1466 jz .unchanged_dst
1467%endif
1468 mov [A1], T0_32
1469.unchanged_dst:
1470 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1471 EPILOGUE_3_ARGS
1472ENDPROC iemAImpl_ %+ %1 %+ _u32
1473
1474;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1475;bad; PROLOGUE_3_ARGS
1476;bad; %1 T1_32, A1_32
1477;bad;%if %4 != 0
1478;bad; jz .unchanged_dst
1479;bad;%endif
1480;bad; mov [A0], T1_32
1481;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1482;bad; EPILOGUE_3_ARGS
1483;bad;.unchanged_dst:
1484;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1485;bad; EPILOGUE_3_ARGS
1486;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1487;bad;
1488;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1489;bad; PROLOGUE_3_ARGS
1490;bad; %1 T0_32, A1_32
1491;bad;%if %4 != 0
1492;bad; jz .unchanged_dst
1493;bad;%endif
1494;bad; mov [A0], T0_32
1495;bad;.unchanged_dst:
1496;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1497;bad; EPILOGUE_3_ARGS
1498;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1499
1500
1501 %ifdef RT_ARCH_AMD64
1502; 64-bit
1503
1504BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1505 PROLOGUE_3_ARGS
1506 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1507 %1 T0, A2
1508%if %4 != 0
1509 jz .unchanged_dst
1510%endif
1511 mov [A1], T0
1512.unchanged_dst:
1513 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1514 EPILOGUE_3_ARGS_EX 8
1515ENDPROC iemAImpl_ %+ %1 %+ _u64
1516
1517;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1518;bad; PROLOGUE_3_ARGS
1519;bad; %1 T1, A1
1520;bad;%if %4 != 0
1521;bad; jz .unchanged_dst
1522;bad;%endif
1523;bad; mov [A0], T1
1524;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1525;bad; EPILOGUE_3_ARGS
1526;bad;.unchanged_dst:
1527;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1528;bad; EPILOGUE_3_ARGS
1529;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1530;bad;
1531;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1532;bad; PROLOGUE_3_ARGS
1533;bad; %1 T0, A1
1534;bad;%if %4 != 0
1535;bad; jz .unchanged_dst
1536;bad;%endif
1537;bad; mov [A0], T0
1538;bad;.unchanged_dst:
1539;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1540;bad; EPILOGUE_3_ARGS_EX 8
1541;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1542
1543 %endif ; RT_ARCH_AMD64
1544%endmacro
1545
1546IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1547IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1548IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1549IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1550
1551
1552;;
1553; Macro for implementing POPCNT.
1554;
1555; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1556; system where the 64-bit accesses requires hand coding.
1557;
1558; All the functions takes a pointer to the destination memory operand in A1,
1559; the source register operand in A2 and eflags in A0.
1560;
1561; ASSUMES Intel and AMD set EFLAGS the same way.
1562;
1563; ASSUMES the instruction does not support memory destination.
1564;
1565; @param 1 The instruction mnemonic.
1566; @param 2 The modified flags.
1567; @param 3 The undefined flags.
1568; @param 4 The zeroed flags.
1569;
1570%macro IEMIMPL_BIT_OP3 4
1571BEGINCODE
1572BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1573 PROLOGUE_3_ARGS
1574 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1575 %1 T0_16, A2_16
1576 mov [A1], T0_16
1577 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1578 EPILOGUE_3_ARGS
1579ENDPROC iemAImpl_ %+ %1 %+ _u16
1580
1581BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1582 PROLOGUE_3_ARGS
1583 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1584 %1 T0_32, A2_32
1585 mov [A1], T0_32
1586 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1587 EPILOGUE_3_ARGS
1588ENDPROC iemAImpl_ %+ %1 %+ _u32
1589
1590 %ifdef RT_ARCH_AMD64
1591BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1592 PROLOGUE_3_ARGS
1593 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1594 %1 T0, A2
1595 mov [A1], T0
1596 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1597 EPILOGUE_3_ARGS_EX 8
1598ENDPROC iemAImpl_ %+ %1 %+ _u64
1599 %endif ; RT_ARCH_AMD64
1600%endmacro
1601IEMIMPL_BIT_OP3 popcnt, X86_EFL_ZF, 0, X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF
1602
1603
1604;
1605; IMUL is also a similar but yet different case (no lock, no mem dst).
1606; The rDX:rAX variant of imul is handled together with mul further down.
1607;
1608BEGINCODE
1609; @param 1 EFLAGS that are modified.
1610; @param 2 Undefined EFLAGS.
1611; @param 3 Function suffix.
1612; @param 4 EFLAGS variation: 0 for native, 1 for intel,
1613; 2 for AMD (set AF, clear PF, ZF and SF).
1614%macro IEMIMPL_IMUL_TWO 4
1615BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1616 PROLOGUE_3_ARGS
1617 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1618 imul A2_16, word [A1]
1619 mov [A1], A2_16
1620 %if %4 != 1
1621 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1622 %else
1623 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_16, 16, A2 ; intel
1624 %endif
1625 EPILOGUE_3_ARGS
1626ENDPROC iemAImpl_imul_two_u16 %+ %3
1627
1628BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1629 PROLOGUE_3_ARGS
1630 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1631 imul A2_32, dword [A1]
1632 mov [A1], A2_32
1633 %if %4 != 1
1634 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1635 %else
1636 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_32, 32, A2 ; intel
1637 %endif
1638 EPILOGUE_3_ARGS
1639ENDPROC iemAImpl_imul_two_u32 %+ %3
1640
1641 %ifdef RT_ARCH_AMD64
1642BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1643 PROLOGUE_3_ARGS
1644 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1645 imul A2, qword [A1]
1646 mov [A1], A2
1647 %if %4 != 1
1648 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1649 %else
1650 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2, 64, A2 ; intel
1651 %endif
1652 EPILOGUE_3_ARGS_EX 8
1653ENDPROC iemAImpl_imul_two_u64 %+ %3
1654 %endif ; RT_ARCH_AMD64
1655%endmacro
1656; The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
1657; flags as is. Whereas Intel skylake (6700K and 10980XE (Cascade Lake)) always
1658; clear AF and ZF and calculates SF and PF as per the lower half of the result.
1659IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1660IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1661IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1662
1663
1664;
1665; XCHG for memory operands. This implies locking. No flag changes.
1666;
1667; Each function takes two arguments, first the pointer to the memory,
1668; then the pointer to the register. They all return void.
1669;
1670BEGINCODE
1671BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1672 PROLOGUE_2_ARGS
1673 mov T0_8, [A1]
1674 xchg [A0], T0_8
1675 mov [A1], T0_8
1676 EPILOGUE_2_ARGS
1677ENDPROC iemAImpl_xchg_u8_locked
1678
1679BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1680 PROLOGUE_2_ARGS
1681 mov T0_16, [A1]
1682 xchg [A0], T0_16
1683 mov [A1], T0_16
1684 EPILOGUE_2_ARGS
1685ENDPROC iemAImpl_xchg_u16_locked
1686
1687BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1688 PROLOGUE_2_ARGS
1689 mov T0_32, [A1]
1690 xchg [A0], T0_32
1691 mov [A1], T0_32
1692 EPILOGUE_2_ARGS
1693ENDPROC iemAImpl_xchg_u32_locked
1694
1695%ifdef RT_ARCH_AMD64
1696BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1697 PROLOGUE_2_ARGS
1698 mov T0, [A1]
1699 xchg [A0], T0
1700 mov [A1], T0
1701 EPILOGUE_2_ARGS
1702ENDPROC iemAImpl_xchg_u64_locked
1703%endif
1704
; Unlocked XCHG variants for fDisregardLock mode: the two locations are
; swapped with plain loads and stores, deliberately without any atomicity.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A0]              ; old memory value
        mov     T0_8, [A1]              ; old register value
        mov     [A1], T1_8              ; register <- old memory
        mov     [A0], T0_8              ; memory   <- old register
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A0]
        mov     T0_16, [A1]
        mov     [A1], T1_16
        mov     [A0], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A0]
        mov     T0_32, [A1]
        mov     [A1], T1_32
        mov     [A0], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A0]
        mov     T0, [A1]
        mov     [A1], T1
        mov     [A0], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1744
1745
1746;
1747; XADD for memory operands.
1748;
1749; Each function takes three arguments, first the pointer to the
1750; memory/register, then the pointer to the register, and finally a pointer to
1751; eflags. They all return void.
1752;
1753BEGINCODE
1754BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1755 PROLOGUE_3_ARGS
1756 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1757 mov T0_8, [A1]
1758 xadd [A0], T0_8
1759 mov [A1], T0_8
1760 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1761 EPILOGUE_3_ARGS
1762ENDPROC iemAImpl_xadd_u8
1763
1764BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1765 PROLOGUE_3_ARGS
1766 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1767 mov T0_16, [A1]
1768 xadd [A0], T0_16
1769 mov [A1], T0_16
1770 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1771 EPILOGUE_3_ARGS
1772ENDPROC iemAImpl_xadd_u16
1773
1774BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1775 PROLOGUE_3_ARGS
1776 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1777 mov T0_32, [A1]
1778 xadd [A0], T0_32
1779 mov [A1], T0_32
1780 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1781 EPILOGUE_3_ARGS
1782ENDPROC iemAImpl_xadd_u32
1783
1784%ifdef RT_ARCH_AMD64
1785BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1786 PROLOGUE_3_ARGS
1787 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1788 mov T0, [A1]
1789 xadd [A0], T0
1790 mov [A1], T0
1791 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1792 EPILOGUE_3_ARGS
1793ENDPROC iemAImpl_xadd_u64
1794%endif ; RT_ARCH_AMD64
1795
; Locked XADD variants - identical to the above except for the LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1837
1838
1839;
1840; CMPXCHG8B.
1841;
1842; These are tricky register wise, so the code is duplicated for each calling
1843; convention.
1844;
1845; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1846;
1847; C-proto:
1848; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1849; uint32_t *pEFlags));
1850;
1851; Note! Identical to iemAImpl_cmpxchg16b.
1852;
1853BEGINCODE
1854BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1855%ifdef RT_ARCH_AMD64
1856 %ifdef ASM_CALL64_MSC
1857 push rbx
1858
1859 mov r11, rdx ; pu64EaxEdx (is also T1)
1860 mov r10, rcx ; pu64Dst
1861
1862 mov ebx, [r8]
1863 mov ecx, [r8 + 4]
1864 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1865 mov eax, [r11]
1866 mov edx, [r11 + 4]
1867
1868 cmpxchg8b [r10]
1869
1870 mov [r11], eax
1871 mov [r11 + 4], edx
1872 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1873
1874 pop rbx
1875 ret
1876 %else
1877 push rbx
1878
1879 mov r10, rcx ; pEFlags
1880 mov r11, rdx ; pu64EbxEcx (is also T1)
1881
1882 mov ebx, [r11]
1883 mov ecx, [r11 + 4]
1884 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1885 mov eax, [rsi]
1886 mov edx, [rsi + 4]
1887
1888 cmpxchg8b [rdi]
1889
1890 mov [rsi], eax
1891 mov [rsi + 4], edx
1892 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1893
1894 pop rbx
1895 ret
1896
1897 %endif
1898%else
1899 push esi
1900 push edi
1901 push ebx
1902 push ebp
1903
1904 mov edi, ecx ; pu64Dst
1905 mov esi, edx ; pu64EaxEdx
1906 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1907 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1908
1909 mov ebx, [ecx]
1910 mov ecx, [ecx + 4]
1911 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1912 mov eax, [esi]
1913 mov edx, [esi + 4]
1914
1915 cmpxchg8b [edi]
1916
1917 mov [esi], eax
1918 mov [esi + 4], edx
1919 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)
1920
1921 pop ebp
1922 pop ebx
1923 pop edi
1924 pop esi
1925 ret 8
1926%endif
1927ENDPROC iemAImpl_cmpxchg8b
1928
; Locked variant of iemAImpl_cmpxchg8b - identical apart from the LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved; needed for the EBX input

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; load comparand AFTER the flag helper since it clobbers eax
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
2003
2004%ifdef RT_ARCH_AMD64
2005
2006;
2007; CMPXCHG16B.
2008;
2009; These are tricky register wise, so the code is duplicated for each calling
2010; convention.
2011;
2012; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
2013;
2014; C-proto:
2015; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
2016; uint32_t *pEFlags));
2017;
2018; Note! Identical to iemAImpl_cmpxchg8b.
2019;
2020BEGINCODE
2021BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
2022 %ifdef ASM_CALL64_MSC
2023 push rbx
2024
2025 mov r11, rdx ; pu64RaxRdx (is also T1)
2026 mov r10, rcx ; pu64Dst
2027
2028 mov rbx, [r8]
2029 mov rcx, [r8 + 8]
2030 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2031 mov rax, [r11]
2032 mov rdx, [r11 + 8]
2033
2034 cmpxchg16b [r10]
2035
2036 mov [r11], rax
2037 mov [r11 + 8], rdx
2038 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2039
2040 pop rbx
2041 ret
2042 %else
2043 push rbx
2044
2045 mov r10, rcx ; pEFlags
2046 mov r11, rdx ; pu64RbxRcx (is also T1)
2047
2048 mov rbx, [r11]
2049 mov rcx, [r11 + 8]
2050 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2051 mov rax, [rsi]
2052 mov rdx, [rsi + 8]
2053
2054 cmpxchg16b [rdi]
2055
2056 mov [rsi], rax
2057 mov [rsi + 8], rdx
2058 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2059
2060 pop rbx
2061 ret
2062
2063 %endif
2064ENDPROC iemAImpl_cmpxchg16b
2065
2066BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
2067 %ifdef ASM_CALL64_MSC
2068 push rbx
2069
2070 mov r11, rdx ; pu64RaxRdx (is also T1)
2071 mov r10, rcx ; pu64Dst
2072
2073 mov rbx, [r8]
2074 mov rcx, [r8 + 8]
2075 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2076 mov rax, [r11]
2077 mov rdx, [r11 + 8]
2078
2079 lock cmpxchg16b [r10]
2080
2081 mov [r11], rax
2082 mov [r11 + 8], rdx
2083 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2084
2085 pop rbx
2086 ret
2087 %else
2088 push rbx
2089
2090 mov r10, rcx ; pEFlags
2091 mov r11, rdx ; pu64RbxRcx (is also T1)
2092
2093 mov rbx, [r11]
2094 mov rcx, [r11 + 8]
2095 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2096 mov rax, [rsi]
2097 mov rdx, [rsi + 8]
2098
2099 lock cmpxchg16b [rdi]
2100
2101 mov [rsi], rax
2102 mov [rsi + 8], rdx
2103 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2104
2105 pop rbx
2106 ret
2107
2108 %endif
2109ENDPROC iemAImpl_cmpxchg16b_locked
2110
2111%endif ; RT_ARCH_AMD64
2112
2113
2114;
2115; CMPXCHG.
2116;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
2118;
2119; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
2121;
2122BEGINCODE
2123%macro IEMIMPL_CMPXCHG 2
2124BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
2125 PROLOGUE_4_ARGS
2126 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2127 mov al, [A1]
2128 %1 cmpxchg [A0], A2_8
2129 mov [A1], al
2130 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2131 EPILOGUE_4_ARGS
2132ENDPROC iemAImpl_cmpxchg_u8 %+ %2
2133
2134BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
2135 PROLOGUE_4_ARGS
2136 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2137 mov ax, [A1]
2138 %1 cmpxchg [A0], A2_16
2139 mov [A1], ax
2140 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2141 EPILOGUE_4_ARGS
2142ENDPROC iemAImpl_cmpxchg_u16 %+ %2
2143
2144BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
2145 PROLOGUE_4_ARGS
2146 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2147 mov eax, [A1]
2148 %1 cmpxchg [A0], A2_32
2149 mov [A1], eax
2150 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2151 EPILOGUE_4_ARGS
2152ENDPROC iemAImpl_cmpxchg_u32 %+ %2
2153
2154BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
2155%ifdef RT_ARCH_AMD64
2156 PROLOGUE_4_ARGS
2157 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2158 mov rax, [A1]
2159 %1 cmpxchg [A0], A2
2160 mov [A1], rax
2161 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2162 EPILOGUE_4_ARGS
2163%else
2164 ;
2165 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
2166 ;
2167 push esi
2168 push edi
2169 push ebx
2170 push ebp
2171
2172 mov edi, ecx ; pu64Dst
2173 mov esi, edx ; pu64Rax
2174 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
2175 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
2176
2177 mov ebx, [ecx]
2178 mov ecx, [ecx + 4]
2179 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2180 mov eax, [esi]
2181 mov edx, [esi + 4]
2182
2183 lock cmpxchg8b [edi]
2184
2185 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
2186 jz .cmpxchg8b_not_equal
2187;; @todo this isn't correct. Need to do a 64-bit compare, not just the lower 32-bit.
2188 cmp eax, eax ; just set the other flags.
2189.store:
2190 mov [esi], eax
2191 mov [esi + 4], edx
2192 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, edi)
2193
2194 pop ebp
2195 pop ebx
2196 pop edi
2197 pop esi
2198 ret 8
2199
2200.cmpxchg8b_not_equal:
2201 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
2202 jne .store
2203 cmp [esi], eax
2204 jmp .store
2205
2206%endif
2207ENDPROC iemAImpl_cmpxchg_u64 %+ %2
2208%endmacro ; IEMIMPL_CMPXCHG
2209
2210IEMIMPL_CMPXCHG , ,
2211IEMIMPL_CMPXCHG lock, _locked
2212
2213
2214
2215;;
2216; Macro for implementing a unary operator.
2217;
2218; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
2219; variants, except on 32-bit system where the 64-bit accesses requires hand
2220; coding.
2221;
2222; All the functions takes a pointer to the destination memory operand in A0,
2223; the source register operand in A1 and a pointer to eflags in A2.
2224;
2225; @param 1 The instruction mnemonic.
2226; @param 2 The modified flags.
2227; @param 3 The undefined flags.
2228;
2229%macro IEMIMPL_UNARY_OP 3
2230BEGINCODE
2231BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
2232 PROLOGUE_2_ARGS
2233 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2234 %1 byte [A0]
2235 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2236 EPILOGUE_2_ARGS
2237ENDPROC iemAImpl_ %+ %1 %+ _u8
2238
2239BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
2240 PROLOGUE_2_ARGS
2241 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2242 lock %1 byte [A0]
2243 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2244 EPILOGUE_2_ARGS
2245ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
2246
2247BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
2248 PROLOGUE_2_ARGS
2249 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2250 %1 word [A0]
2251 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2252 EPILOGUE_2_ARGS
2253ENDPROC iemAImpl_ %+ %1 %+ _u16
2254
2255BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
2256 PROLOGUE_2_ARGS
2257 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2258 lock %1 word [A0]
2259 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2260 EPILOGUE_2_ARGS
2261ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
2262
2263BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
2264 PROLOGUE_2_ARGS
2265 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2266 %1 dword [A0]
2267 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2268 EPILOGUE_2_ARGS
2269ENDPROC iemAImpl_ %+ %1 %+ _u32
2270
2271BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
2272 PROLOGUE_2_ARGS
2273 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2274 lock %1 dword [A0]
2275 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2276 EPILOGUE_2_ARGS
2277ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
2278
2279 %ifdef RT_ARCH_AMD64
2280BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
2281 PROLOGUE_2_ARGS
2282 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2283 %1 qword [A0]
2284 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2285 EPILOGUE_2_ARGS
2286ENDPROC iemAImpl_ %+ %1 %+ _u64
2287
2288BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
2289 PROLOGUE_2_ARGS
2290 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2291 lock %1 qword [A0]
2292 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2293 EPILOGUE_2_ARGS
2294ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
2295 %endif ; RT_ARCH_AMD64
2296
2297%endmacro
2298
2299IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
2300IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
2301IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
2302IEMIMPL_UNARY_OP not, 0, 0
2303
2304
2305;
2306; BSWAP. No flag changes.
2307;
2308; Each function takes one argument, pointer to the value to bswap
2309; (input/output). They all return void.
2310;
2311BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
2312 PROLOGUE_1_ARGS
2313 mov T0_32, [A0] ; just in case any of the upper bits are used.
2314 db 66h
2315 bswap T0_32
2316 mov [A0], T0_32
2317 EPILOGUE_1_ARGS
2318ENDPROC iemAImpl_bswap_u16
2319
2320BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
2321 PROLOGUE_1_ARGS
2322 mov T0_32, [A0]
2323 bswap T0_32
2324 mov [A0], T0_32
2325 EPILOGUE_1_ARGS
2326ENDPROC iemAImpl_bswap_u32
2327
2328BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
2329%ifdef RT_ARCH_AMD64
2330 PROLOGUE_1_ARGS
2331 mov T0, [A0]
2332 bswap T0
2333 mov [A0], T0
2334 EPILOGUE_1_ARGS
2335%else
2336 PROLOGUE_1_ARGS
2337 mov T0, [A0]
2338 mov T1, [A0 + 4]
2339 bswap T0
2340 bswap T1
2341 mov [A0 + 4], T0
2342 mov [A0], T1
2343 EPILOGUE_1_ARGS
2344%endif
2345ENDPROC iemAImpl_bswap_u64
2346
2347
2348;;
2349; Macro for implementing a shift operation.
2350;
2351; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2352; 32-bit system where the 64-bit accesses requires hand coding.
2353;
2354; All the functions takes a pointer to the destination memory operand in A0,
2355; the shift count in A1 and a pointer to eflags in A2.
2356;
2357; @param 1 The instruction mnemonic.
2358; @param 2 The modified flags.
2359; @param 3 The undefined flags.
2360; @param 4 Force load flags.
2361;
2362; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
2363;
2364; @note the _intel and _amd variants are implemented in C.
2365;
2366%macro IEMIMPL_SHIFT_OP 4
2367BEGINCODE
2368BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
2369 PROLOGUE_3_ARGS
2370 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %4
2371 %ifdef ASM_CALL64_GCC
2372 mov cl, A1_8
2373 %1 byte [A0], cl
2374 %else
2375 xchg A1, A0
2376 %1 byte [A1], cl
2377 %endif
2378 IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
2379.zero_shift:
2380 EPILOGUE_3_ARGS
2381ENDPROC iemAImpl_ %+ %1 %+ _u8
2382
2383BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
2384 PROLOGUE_3_ARGS
2385 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %4
2386 %ifdef ASM_CALL64_GCC
2387 mov cl, A1_8
2388 %1 word [A0], cl
2389 %else
2390 xchg A1, A0
2391 %1 word [A1], cl
2392 %endif
2393 IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
2394 EPILOGUE_3_ARGS
2395ENDPROC iemAImpl_ %+ %1 %+ _u16
2396
2397BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
2398 PROLOGUE_3_ARGS
2399 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %4
2400 %ifdef ASM_CALL64_GCC
2401 mov cl, A1_8
2402 %1 dword [A0], cl
2403 %else
2404 xchg A1, A0
2405 %1 dword [A1], cl
2406 %endif
2407 IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
2408 EPILOGUE_3_ARGS
2409ENDPROC iemAImpl_ %+ %1 %+ _u32
2410
2411 %ifdef RT_ARCH_AMD64
2412BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2413 PROLOGUE_3_ARGS
2414 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %4
2415 %ifdef ASM_CALL64_GCC
2416 mov cl, A1_8
2417 %1 qword [A0], cl
2418 %else
2419 xchg A1, A0
2420 %1 qword [A1], cl
2421 %endif
2422 IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
2423 EPILOGUE_3_ARGS
2424ENDPROC iemAImpl_ %+ %1 %+ _u64
2425 %endif ; RT_ARCH_AMD64
2426
2427%endmacro
2428
2429; These instructions will NOT modify flags if the masked shift count is zero
2430; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
2431; we have to force load all modified and undefined.
2432IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2433IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2434IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2435IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2436IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2437IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2438IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2439
2440
2441;;
2442; Macro for implementing a double precision shift operation.
2443;
2444; This will generate code for the 16, 32 and 64 bit accesses, except on
2445; 32-bit system where the 64-bit accesses requires hand coding.
2446;
2447; The functions takes the destination operand (r/m) in A0, the source (reg) in
2448; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
2449;
2450; @param 1 The instruction mnemonic.
2451; @param 2 The modified flags.
2452; @param 3 The undefined flags.
2453; @param 4 The force loaded flags.
2454;
2455; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
2456;
2457; @note the _intel and _amd variants are implemented in C.
2458;
2459%macro IEMIMPL_SHIFT_DBL_OP 4
2460BEGINCODE
2461BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
2462 PROLOGUE_4_ARGS
2463 ;IEM_LOAD_FLAGS_OLD A3, %4, %3
2464 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
2465 %ifdef ASM_CALL64_GCC
2466 xchg A3, A2
2467 %1 [A0], A1_16, cl
2468 xchg A3, A2
2469 %else
2470 xchg A0, A2
2471 %1 [A2], A1_16, cl
2472 %endif
2473 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2474 EPILOGUE_4_ARGS
2475ENDPROC iemAImpl_ %+ %1 %+ _u16
2476
2477BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
2478 PROLOGUE_4_ARGS
2479 ;IEM_LOAD_FLAGS_OLD A3, %4, %3
2480 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
2481 %ifdef ASM_CALL64_GCC
2482 xchg A3, A2
2483 %1 [A0], A1_32, cl
2484 xchg A3, A2
2485 %else
2486 xchg A0, A2
2487 %1 [A2], A1_32, cl
2488 %endif
2489 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2490 EPILOGUE_4_ARGS
2491ENDPROC iemAImpl_ %+ %1 %+ _u32
2492
2493 %ifdef RT_ARCH_AMD64
2494BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
2495 PROLOGUE_4_ARGS
2496 ;IEM_LOAD_FLAGS_OLD A3, %4, %3
2497 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
2498 %ifdef ASM_CALL64_GCC
2499 xchg A3, A2
2500 %1 [A0], A1, cl
2501 xchg A3, A2
2502 %else
2503 xchg A0, A2
2504 %1 [A2], A1, cl
2505 %endif
2506 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2507 EPILOGUE_4_ARGS_EX 12
2508ENDPROC iemAImpl_ %+ %1 %+ _u64
2509 %endif ; RT_ARCH_AMD64
2510
2511%endmacro
2512
2513; These instructions will NOT modify flags if the masked shift count is zero
2514; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
2515; we have to force load all modified and undefined.
2516IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2517IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2518
2519
2520;;
2521; Macro for implementing a multiplication operations.
2522;
2523; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2524; 32-bit system where the 64-bit accesses requires hand coding.
2525;
2526; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2527; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2528; pointer to eflags in A3.
2529;
2530; The functions all return 0 so the caller can be used for div/idiv as well as
2531; for the mul/imul implementation.
2532;
2533; @param 1 The instruction mnemonic.
2534; @param 2 The modified flags.
2535; @param 3 The undefined flags.
2536; @param 4 Name suffix.
2537; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2538;
2539; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2540;
2541%macro IEMIMPL_MUL_OP 5
2542BEGINCODE
2543BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
2544 PROLOGUE_3_ARGS
2545 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2546 mov al, [A0]
2547 %1 A1_8
2548 mov [A0], ax
2549 %if %5 != 1
2550 IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
2551 %else
2552 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX ; intel
2553 %endif
2554 xor eax, eax
2555 EPILOGUE_3_ARGS
2556ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
2557
2558BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
2559 PROLOGUE_4_ARGS
2560 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2561 mov ax, [A0]
2562 %ifdef ASM_CALL64_GCC
2563 %1 A2_16
2564 mov [A0], ax
2565 mov [A1], dx
2566 %else
2567 mov T1, A1
2568 %1 A2_16
2569 mov [A0], ax
2570 mov [T1], dx
2571 %endif
2572 %if %5 != 1
2573 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2574 %else
2575 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX ; intel
2576 %endif
2577 xor eax, eax
2578 EPILOGUE_4_ARGS
2579ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
2580
2581BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
2582 PROLOGUE_4_ARGS
2583 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2584 mov eax, [A0]
2585 %ifdef ASM_CALL64_GCC
2586 %1 A2_32
2587 mov [A0], eax
2588 mov [A1], edx
2589 %else
2590 mov T1, A1
2591 %1 A2_32
2592 mov [A0], eax
2593 mov [T1], edx
2594 %endif
2595 %if %5 != 1
2596 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2597 %else
2598 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX ; intel
2599 %endif
2600 xor eax, eax
2601 EPILOGUE_4_ARGS
2602ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
2603
2604 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2605BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2606 PROLOGUE_4_ARGS
2607 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2608 mov rax, [A0]
2609 %ifdef ASM_CALL64_GCC
2610 %1 A2
2611 mov [A0], rax
2612 mov [A1], rdx
2613 %else
2614 mov T1, A1
2615 %1 A2
2616 mov [A0], rax
2617 mov [T1], rdx
2618 %endif
2619 %if %5 != 1
2620 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2621 %else
2622 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX ; intel
2623 %endif
2624 xor eax, eax
2625 EPILOGUE_4_ARGS_EX 12
2626ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2627 %endif ; !RT_ARCH_AMD64
2628
2629%endmacro
2630
2631IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2632IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2633IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2634IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2635IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2636IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2637
2638
2639BEGINCODE
2640;;
2641; Worker function for negating a 32-bit number in T1:T0
2642; @uses None (T0,T1)
2643BEGINPROC iemAImpl_negate_T0_T1_u32
2644 push 0
2645 push 0
2646 xchg T0_32, [xSP]
2647 xchg T1_32, [xSP + xCB]
2648 sub T0_32, [xSP]
2649 sbb T1_32, [xSP + xCB]
2650 add xSP, xCB*2
2651 ret
2652ENDPROC iemAImpl_negate_T0_T1_u32
2653
2654%ifdef RT_ARCH_AMD64
2655;;
2656; Worker function for negating a 64-bit number in T1:T0
2657; @uses None (T0,T1)
2658BEGINPROC iemAImpl_negate_T0_T1_u64
2659 push 0
2660 push 0
2661 xchg T0, [xSP]
2662 xchg T1, [xSP + xCB]
2663 sub T0, [xSP]
2664 sbb T1, [xSP + xCB]
2665 add xSP, xCB*2
2666 ret
2667ENDPROC iemAImpl_negate_T0_T1_u64
2668%endif
2669
2670
2671;;
2672; Macro for implementing a division operations.
2673;
2674; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2675; 32-bit system where the 64-bit accesses requires hand coding.
2676;
2677; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2678; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2679; pointer to eflags in A3.
2680;
2681; The functions all return 0 on success and -1 if a divide error should be
2682; raised by the caller.
2683;
2684; @param 1 The instruction mnemonic.
2685; @param 2 The modified flags.
2686; @param 3 The undefined flags.
2687; @param 4 1 if signed, 0 if unsigned.
2688; @param 5 Function suffix.
2689; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2690; 2 for AMD (set AF, clear PF, ZF and SF).
2691;
2692; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2693;
2694%macro IEMIMPL_DIV_OP 6
2695BEGINCODE
2696BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2697 PROLOGUE_3_ARGS
2698
2699 ; div by chainsaw check.
2700 and A1_32, 0xff ; Ensure it's zero extended to 16-bits for the idiv range check.
2701 jz .div_zero
2702
2703 ; Overflow check - unsigned division is simple to verify, haven't
2704 ; found a simple way to check signed division yet unfortunately.
2705 %if %4 == 0
2706 cmp [A0 + 1], A1_8
2707 jae .div_overflow
2708 %else
2709 movzx T0_32, word [A0] ; T0 = dividend (zero extending to full register to simplify register aliasing)
2710 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2711 test A1_8, A1_8
2712 js .divisor_negative
2713 test T0_16, T0_16
2714 jns .both_positive
2715 neg T0_16
2716.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2717 push T0 ; Start off like unsigned below.
2718 shr T0_16, 7
2719 cmp T0_16, A1_16 ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
2720 pop T0
2721 jb .div_no_overflow
2722 ja .div_overflow
2723 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2724 cmp T0_8, A1_8
2725 jae .div_overflow
2726 jmp .div_no_overflow
2727
2728.divisor_negative:
2729 neg A1_8
2730 test T0_16, T0_16
2731 jns .one_of_each
2732 neg T0_16
2733.both_positive: ; Same as unsigned shifted by sign indicator bit.
2734 shr T0_16, 7
2735 cmp T0_16, A1_16 ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
2736 jae .div_overflow
2737.div_no_overflow:
2738 mov A1, T1 ; restore divisor
2739 %endif
2740
2741 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (Intel)
2742 mov ax, [A0]
2743 %1 A1_8
2744 mov [A0], ax
2745 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2746 IEM_ADJUST_FLAGS_OLD A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2747 %else
2748 IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
2749 %endif
2750 xor eax, eax
2751
2752.return:
2753 EPILOGUE_3_ARGS
2754
2755.div_zero:
2756.div_overflow:
2757 mov eax, -1
2758 jmp .return
2759ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2760
2761BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2762 PROLOGUE_4_ARGS
2763
2764 ; div by chainsaw check.
2765 and A2_16, 0xffff ; Zero extend it for simpler sign overflow checks (see below).
2766 jz .div_zero
2767
2768 ; Overflow check - unsigned division is simple to verify, haven't
2769 ; found a simple way to check signed division yet unfortunately.
2770 %if %4 == 0
2771 cmp [A1], A2_16
2772 jae .div_overflow
2773 %else
2774 movzx T0_32, word [A1] ; Zero extend to simplify register aliasing by clobbing the whole register.
2775 shl T0_32, 16
2776 mov T0_16, [A0] ; T0 = dividend
2777 mov T1, A2 ; T1 = divisor
2778 test T1_16, T1_16
2779 js .divisor_negative
2780 test T0_32, T0_32
2781 jns .both_positive
2782 neg T0_32
2783.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2784 push T0 ; Start off like unsigned below.
2785 shr T0_32, 15
2786 cmp T0_32, T1_32 ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
2787 pop T0
2788 jb .div_no_overflow
2789 ja .div_overflow
2790 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2791 cmp T0_16, T1_16
2792 jae .div_overflow
2793 jmp .div_no_overflow
2794
2795.divisor_negative:
2796 neg T1_16
2797 test T0_32, T0_32
2798 jns .one_of_each
2799 neg T0_32
2800.both_positive: ; Same as unsigned shifted by sign indicator bit.
2801 shr T0_32, 15
2802 cmp T0_32, T1_32 ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
2803 jae .div_overflow
2804.div_no_overflow:
2805 %endif
2806
2807 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2808 %ifdef ASM_CALL64_GCC
2809 mov T1, A2
2810 mov ax, [A0]
2811 mov dx, [A1]
2812 %1 T1_16
2813 mov [A0], ax
2814 mov [A1], dx
2815 %else
2816 mov T1, A1
2817 mov ax, [A0]
2818 mov dx, [T1]
2819 %1 A2_16
2820 mov [A0], ax
2821 mov [T1], dx
2822 %endif
2823 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2824 IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2825 %else
2826 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2827 %endif
2828 xor eax, eax
2829
2830.return:
2831 EPILOGUE_4_ARGS
2832
2833.div_zero:
2834.div_overflow:
2835 mov eax, -1
2836 jmp .return
2837ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2838
2839BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2840 PROLOGUE_4_ARGS
2841
2842 ; div by chainsaw check.
2843 test A2_32, A2_32
2844 jz .div_zero
2845
2846 ; Overflow check - unsigned division is simple to verify, haven't
2847 ; found a simple way to check signed division yet unfortunately.
2848 %if %4 == 0
2849 cmp [A1], A2_32
2850 jae .div_overflow
2851 %else
2852 push A2 ; save A2 so we modify it (we out of regs on x86).
2853 mov T0_32, [A0] ; T0 = dividend low
2854 mov T1_32, [A1] ; T1 = dividend high
2855 ;test A2_32, A2_32 - we did this 5 instructions ago.
2856 js .divisor_negative
2857 test T1_32, T1_32
2858 jns .both_positive
2859 call NAME(iemAImpl_negate_T0_T1_u32)
2860.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2861 test T1_32, 0x80000000 ; neg 0x8000000000000000 = 0x8000000000000000
2862 jnz .div_overflow
2863 push T0 ; Start off like unsigned below.
2864 shl T1_32, 1
2865 shr T0_32, 31
2866 or T1_32, T0_32
2867 cmp T1_32, A2_32
2868 pop T0
2869 jb .div_no_overflow
2870 ja .div_overflow
2871 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2872 cmp T0_32, A2_32
2873 jae .div_overflow
2874 jmp .div_no_overflow
2875
2876.divisor_negative:
2877 neg A2_32
2878 test T1_32, T1_32
2879 jns .one_of_each
2880 call NAME(iemAImpl_negate_T0_T1_u32)
2881.both_positive: ; Same as unsigned shifted by sign indicator bit.
2882 test T1_32, 0x80000000 ; neg 0x8000000000000000 = 0x8000000000000000
2883 jnz .div_overflow
2884 shl T1_32, 1
2885 shr T0_32, 31
2886 or T1_32, T0_32
2887 cmp T1_32, A2_32
2888 jae .div_overflow
2889.div_no_overflow:
2890 pop A2
2891 %endif
2892
2893 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2894 mov eax, [A0]
2895 %ifdef ASM_CALL64_GCC
2896 mov T1, A2
2897 mov eax, [A0]
2898 mov edx, [A1]
2899 %1 T1_32
2900 mov [A0], eax
2901 mov [A1], edx
2902 %else
2903 mov T1, A1
2904 mov eax, [A0]
2905 mov edx, [T1]
2906 %1 A2_32
2907 mov [A0], eax
2908 mov [T1], edx
2909 %endif
2910 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2911 IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2912 %else
2913 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2914 %endif
2915 xor eax, eax
2916
2917.return:
2918 EPILOGUE_4_ARGS
2919
2920.div_overflow:
2921 %if %4 != 0
2922 pop A2
2923 %endif
2924.div_zero:
2925 mov eax, -1
2926 jmp .return
2927ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2928
2929 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2930BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2931 PROLOGUE_4_ARGS
2932
2933 test A2, A2
2934 jz .div_zero
2935 %if %4 == 0
2936 cmp [A1], A2
2937 jae .div_overflow
2938 %else
2939 push A2 ; save A2 so we modify it (we out of regs on x86).
2940 mov T0, [A0] ; T0 = dividend low
2941 mov T1, [A1] ; T1 = dividend high
2942 ;test A2, A2 - we did this five instructions above.
2943 js .divisor_negative
2944 test T1, T1
2945 jns .both_positive
2946 call NAME(iemAImpl_negate_T0_T1_u64)
2947.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2948 bt T1, 63 ; neg 0x8000000000000000'0000000000000000 = same
2949 jc .div_overflow
2950 push T0 ; Start off like unsigned below.
2951 shl T1, 1
2952 shr T0, 63
2953 or T1, T0
2954 cmp T1, A2
2955 pop T0
2956 jb .div_no_overflow
2957 ja .div_overflow
2958 mov T1, 0x7fffffffffffffff
2959 and T0, T1 ; Special case for covering (divisor - 1).
2960 cmp T0, A2
2961 jae .div_overflow
2962 jmp .div_no_overflow
2963
2964.divisor_negative:
2965 neg A2
2966 test T1, T1
2967 jns .one_of_each
2968 call NAME(iemAImpl_negate_T0_T1_u64)
2969.both_positive: ; Same as unsigned shifted by sign indicator bit.
2970 bt T1, 63 ; neg 0x8000000000000000'0000000000000000 = same
2971 jc .div_overflow
2972 shl T1, 1
2973 shr T0, 63
2974 or T1, T0
2975 cmp T1, A2
2976 jae .div_overflow
2977.div_no_overflow:
2978 pop A2
2979 %endif
2980
2981 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2982 mov rax, [A0]
2983 %ifdef ASM_CALL64_GCC
2984 mov T1, A2
2985 mov rax, [A0]
2986 mov rdx, [A1]
2987 %1 T1
2988 mov [A0], rax
2989 mov [A1], rdx
2990 %else
2991 mov T1, A1
2992 mov rax, [A0]
2993 mov rdx, [T1]
2994 %1 A2
2995 mov [A0], rax
2996 mov [T1], rdx
2997 %endif
2998 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2999 IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
3000 %else
3001 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
3002 %endif
3003 xor eax, eax
3004
3005.return:
3006 EPILOGUE_4_ARGS_EX 12
3007
3008.div_overflow:
3009 %if %4 != 0
3010 pop A2
3011 %endif
3012.div_zero:
3013 mov eax, -1
3014 jmp .return
3015ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
3016 %endif ; !RT_ARCH_AMD64
3017
3018%endmacro
3019
3020IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
3021IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
3022IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
3023;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
3024IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
3025IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
3026IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
3027
3028
3029;;
3030; Macro for implementing memory fence operation.
3031;
3032; No return value, no operands or anything.
3033;
3034; @param 1 The instruction.
3035;
3036%macro IEMIMPL_MEM_FENCE 1
3037BEGINCODE
3038BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
3039 %1
3040 ret
3041ENDPROC iemAImpl_ %+ %1
3042%endmacro
3043
3044IEMIMPL_MEM_FENCE lfence
3045IEMIMPL_MEM_FENCE sfence
3046IEMIMPL_MEM_FENCE mfence
3047
3048;;
3049; Alternative for non-SSE2 host.
3050;
3051BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
3052 push xAX
3053 xchg xAX, [xSP]
3054 add xSP, xCB
3055 ret
3056ENDPROC iemAImpl_alt_mem_fence
3057
3058
3059;;
3060; Initialize the FPU for the actual instruction being emulated, this means
3061; loading parts of the guest's control word and status word.
3062;
3063; @uses 24 bytes of stack. T0, T1
3064; @param 1 Expression giving the address of the FXSTATE of the guest.
3065;
3066%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
3067 fnstenv [xSP]
3068
3069 ; FCW - for exception, precision and rounding control.
3070 movzx T0, word [%1 + X86FXSTATE.FCW]
3071 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
3072 mov [xSP + X86FSTENV32P.FCW], T0_16
3073
3074 ; FSW - for undefined C0, C1, C2, and C3.
3075 movzx T1, word [%1 + X86FXSTATE.FSW]
3076 and T1, X86_FSW_C_MASK
3077 movzx T0, word [xSP + X86FSTENV32P.FSW]
3078 and T0, X86_FSW_TOP_MASK
3079 or T0, T1
3080 mov [xSP + X86FSTENV32P.FSW], T0_16
3081
3082 fldenv [xSP]
3083%endmacro
3084
3085
3086;;
3087; Initialize the FPU for the actual instruction being emulated, this means
3088; loading parts of the guest's control word, status word, and update the
3089; tag word for the top register if it's empty.
3090;
3091; ASSUMES actual TOP=7
3092;
3093; @uses 24 bytes of stack. T0, T1
3094; @param 1 Expression giving the address of the FXSTATE of the guest.
3095;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; save the current FPU environment so we can patch it below

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK   ; keep only the guest condition code bits
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; keep the actual TOP value
        or      T0_32, T1_32            ; merge guest C0-C3 with the actual TOP
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK ; T1 = guest TOP, i.e. the FTW bit index for guest ST0
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3 (empty)
%%st0_not_empty:

        fldenv  [xSP]                   ; activate the patched environment
%endmacro
3123
3124
3125;;
3126; Need to move this as well somewhere better?
3127;
struc IEMFPURESULT
    .r80Result  resw 5                  ; 80-bit (10 byte) FPU value result.
    .FSW        resw 1                  ; Resulting FPU status word.
endstruc
3132
3133
3134;;
3135; Need to move this as well somewhere better?
3136;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; First 80-bit (10 byte) FPU value result.
    .FSW        resw 1                  ; Resulting FPU status word.
    .r80Result2 resw 5                  ; Second 80-bit (10 byte) FPU value result.
endstruc
3142
3143
3144;
3145;---------------------- 16-bit signed integer operations ----------------------
3146;
3147
3148
3149;;
; Converts a 16-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
3155;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fild    word [A2]               ; load & convert the 16-bit signed integer into ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as an 80-bit value

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
3172
3173
3174;;
3175; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
3176;
3177; @param A0 FPU context (fxsave).
3178; @param A1 Where to return the output FSW.
3179; @param A2 Where to store the 16-bit signed integer value.
3180; @param A3 Pointer to the 80-bit value.
3181;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fistp   word [A2]               ; convert & store ST0 as 16-bit signed integer (rounds per FCW.RC)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
3197
3198
3199;;
3200; Store a 80-bit floating point value (register) as a 16-bit signed integer
3201; (memory) with truncation.
3202;
3203; @param A0 FPU context (fxsave).
3204; @param A1 Where to return the output FSW.
3205; @param A2 Where to store the 16-bit signed integer value.
3206; @param A3 Pointer to the 80-bit value.
3207;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fisttp  word [A2]               ; convert & store ST0 as 16-bit signed integer, truncating regardless of FCW.RC

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
3223
3224
3225;;
3226; FPU instruction working on one 80-bit and one 16-bit signed integer value.
3227;
3228; @param 1 The instruction
3229;
3230; @param A0 FPU context (fxsave).
3231; @param A1 Pointer to a IEMFPURESULT for the output.
3232; @param A2 Pointer to the 80-bit value.
3233; @param A3 Pointer to the 16-bit value.
3234;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      word [A3]               ; ST0 <op>= the 16-bit integer memory operand

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
3261
3262
3263;;
3264; FPU instruction working on one 80-bit and one 16-bit signed integer value,
3265; only returning FSW.
3266;
3267; @param 1 The instruction
3268;
3269; @param A0 FPU context (fxsave).
3270; @param A1 Where to store the output FSW.
3271; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
3273;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      word [A3]               ; compare ST0 against the 16-bit integer memory operand

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
3293
3294
3295
3296;
3297;---------------------- 32-bit signed integer operations ----------------------
3298;
3299
3300
3301;;
; Converts a 32-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
3307;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fild    dword [A2]              ; load & convert the 32-bit signed integer into ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as an 80-bit value

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
3324
3325
3326;;
3327; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
3328;
3329; @param A0 FPU context (fxsave).
3330; @param A1 Where to return the output FSW.
3331; @param A2 Where to store the 32-bit signed integer value.
3332; @param A3 Pointer to the 80-bit value.
3333;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fistp   dword [A2]              ; convert & store ST0 as 32-bit signed integer (rounds per FCW.RC)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
3349
3350
3351;;
3352; Store a 80-bit floating point value (register) as a 32-bit signed integer
3353; (memory) with truncation.
3354;
3355; @param A0 FPU context (fxsave).
3356; @param A1 Where to return the output FSW.
3357; @param A2 Where to store the 32-bit signed integer value.
3358; @param A3 Pointer to the 80-bit value.
3359;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fisttp  dword [A2]              ; convert & store ST0 as 32-bit signed integer, truncating regardless of FCW.RC

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
3375
3376
3377;;
3378; FPU instruction working on one 80-bit and one 32-bit signed integer value.
3379;
3380; @param 1 The instruction
3381;
3382; @param A0 FPU context (fxsave).
3383; @param A1 Pointer to a IEMFPURESULT for the output.
3384; @param A2 Pointer to the 80-bit value.
3385; @param A3 Pointer to the 32-bit value.
3386;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      dword [A3]              ; ST0 <op>= the 32-bit integer memory operand

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
3413
3414
3415;;
3416; FPU instruction working on one 80-bit and one 32-bit signed integer value,
3417; only returning FSW.
3418;
3419; @param 1 The instruction
3420;
3421; @param A0 FPU context (fxsave).
3422; @param A1 Where to store the output FSW.
3423; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
3425;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      dword [A3]              ; compare ST0 against the 32-bit integer memory operand

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
3445
3446
3447
3448;
3449;---------------------- 64-bit signed integer operations ----------------------
3450;
3451
3452
3453;;
; Converts a 64-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
3459;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fild    qword [A2]              ; load & convert the 64-bit signed integer into ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as an 80-bit value

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
3476
3477
3478;;
3479; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
3480;
3481; @param A0 FPU context (fxsave).
3482; @param A1 Where to return the output FSW.
3483; @param A2 Where to store the 64-bit signed integer value.
3484; @param A3 Pointer to the 80-bit value.
3485;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fistp   qword [A2]              ; convert & store ST0 as 64-bit signed integer (rounds per FCW.RC)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
3501
3502
3503;;
3504; Store a 80-bit floating point value (register) as a 64-bit signed integer
3505; (memory) with truncation.
3506;
3507; @param A0 FPU context (fxsave).
3508; @param A1 Where to return the output FSW.
3509; @param A2 Where to store the 64-bit signed integer value.
3510; @param A3 Pointer to the 80-bit value.
3511;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fisttp  qword [A2]              ; convert & store ST0 as 64-bit signed integer, truncating regardless of FCW.RC

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
3527
3528
3529
3530;
3531;---------------------- 32-bit floating point operations ----------------------
3532;
3533
3534;;
3535; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3536;
3537; @param A0 FPU context (fxsave).
3538; @param A1 Pointer to a IEMFPURESULT for the output.
3539; @param A2 Pointer to the 32-bit floating point value to convert.
3540;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fld     dword [A2]              ; load & convert the 32-bit float into ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as an 80-bit value

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
3557
3558
3559;;
3560; Store a 80-bit floating point value (register) as a 32-bit one (memory).
3561;
3562; @param A0 FPU context (fxsave).
3563; @param A1 Where to return the output FSW.
3564; @param A2 Where to store the 32-bit value.
3565; @param A3 Pointer to the 80-bit value.
3566;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fst     dword [A2]              ; convert & store ST0 as a 32-bit float (no pop)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
3582
3583
3584;;
3585; FPU instruction working on one 80-bit and one 32-bit floating point value.
3586;
3587; @param 1 The instruction
3588;
3589; @param A0 FPU context (fxsave).
3590; @param A1 Pointer to a IEMFPURESULT for the output.
3591; @param A2 Pointer to the 80-bit value.
3592; @param A3 Pointer to the 32-bit value.
3593;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      dword [A3]              ; ST0 <op>= the 32-bit float memory operand

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3620
3621
3622;;
3623; FPU instruction working on one 80-bit and one 32-bit floating point value,
3624; only returning FSW.
3625;
3626; @param 1 The instruction
3627;
3628; @param A0 FPU context (fxsave).
3629; @param A1 Where to store the output FSW.
3630; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
3632;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      dword [A3]              ; compare ST0 against the 32-bit float memory operand

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3652
3653
3654
3655;
3656;---------------------- 64-bit floating point operations ----------------------
3657;
3658
3659;;
3660; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3661;
3662; @param A0 FPU context (fxsave).
3663; @param A1 Pointer to a IEMFPURESULT for the output.
3664; @param A2 Pointer to the 64-bit floating point value to convert.
3665;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fld     qword [A2]              ; load & convert the 64-bit float into ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as an 80-bit value

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3682
3683
3684;;
3685; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3686;
3687; @param A0 FPU context (fxsave).
3688; @param A1 Where to return the output FSW.
3689; @param A2 Where to store the 64-bit value.
3690; @param A3 Pointer to the 80-bit value.
3691;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fst     qword [A2]              ; convert & store ST0 as a 64-bit float (no pop)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3707
3708
3709;;
3710; FPU instruction working on one 80-bit and one 64-bit floating point value.
3711;
3712; @param 1 The instruction
3713;
3714; @param A0 FPU context (fxsave).
3715; @param A1 Pointer to a IEMFPURESULT for the output.
3716; @param A2 Pointer to the 80-bit value.
3717; @param A3 Pointer to the 64-bit value.
3718;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      qword [A3]              ; ST0 <op>= the 64-bit float memory operand

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3745
3746;;
3747; FPU instruction working on one 80-bit and one 64-bit floating point value,
3748; only returning FSW.
3749;
3750; @param 1 The instruction
3751;
3752; @param A0 FPU context (fxsave).
3753; @param A1 Where to store the output FSW.
3754; @param A2 Pointer to the 80-bit value.
3755; @param A3 Pointer to the 64-bit value.
3756;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      qword [A3]              ; compare ST0 against the 64-bit float memory operand

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3776
3777
3778
3779;
3780;---------------------- 80-bit floating point operations ----------------------
3781;
3782
3783;;
3784; Loads a 80-bit floating point register value from memory.
3785;
3786; @param A0 FPU context (fxsave).
3787; @param A1 Pointer to a IEMFPURESULT for the output.
3788; @param A2 Pointer to the 80-bit floating point value to load.
3789;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fld     tword [A2]              ; load the 80-bit value into ST0 (no conversion)

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3806
3807
3808;;
3809; Store a 80-bit floating point register to memory
3810;
3811; @param A0 FPU context (fxsave).
3812; @param A1 Where to return the output FSW.
3813; @param A2 Where to store the 80-bit value.
3814; @param A3 Pointer to the 80-bit register value.
3815;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = the 80-bit register value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fstp    tword [A2]              ; store ST0 as an 80-bit value and pop

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3831
3832
3833;;
3834; Loads an 80-bit floating point register value in BCD format from memory.
3835;
3836; @param A0 FPU context (fxsave).
3837; @param A1 Pointer to a IEMFPURESULT for the output.
3838; @param A2 Pointer to the 80-bit BCD value to load.
3839;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fbld    tword [A2]              ; load & convert the 80-bit packed BCD value into ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as an 80-bit value

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3856
3857
3858;;
3859; Store a 80-bit floating point register to memory as BCD
3860;
3861; @param A0 FPU context (fxsave).
3862; @param A1 Where to return the output FSW.
3863; @param A2 Where to store the 80-bit BCD value.
3864; @param A3 Pointer to the 80-bit register value.
3865;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; ST0 = the 80-bit register value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        fbstp   tword [A2]              ; convert & store ST0 as 80-bit packed BCD and pop

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3881
3882
3883;;
3884; FPU instruction working on two 80-bit floating point values.
3885;
; @param 1 The instruction
; @param 2 The instruction operand(s), e.g. {st0, st1}, or {} when implicit.
3887;
3888; @param A0 FPU context (fxsave).
3889; @param A1 Pointer to a IEMFPURESULT for the output.
3890; @param A2 Pointer to the first 80-bit value (ST0)
3891; @param A3 Pointer to the second 80-bit value (STn).
3892;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; loaded first, ends up in ST1
        fld     tword [A2]              ; loaded second, ends up in ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      %2                      ; the instruction with its operands (%2 may be empty for implicit ST0/ST1)

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3923
3924
3925;;
3926; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3927; storing the result in ST1 and popping the stack.
3928;
3929; @param 1 The instruction
3930;
3931; @param A0 FPU context (fxsave).
3932; @param A1 Pointer to a IEMFPURESULT for the output.
3933; @param A2 Pointer to the first 80-bit value (ST1).
3934; @param A3 Pointer to the second 80-bit value (ST0).
3935;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; loaded first, ends up in ST1
        fld     tword [A3]              ; loaded second, ends up in ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1                              ; stores the result in ST1 and pops, so the result is then in ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return the result

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3960
3961
3962;;
3963; FPU instruction working on two 80-bit floating point values, only
3964; returning FSW.
3965;
3966; @param 1 The instruction
3967;
3968; @param A0 FPU context (fxsave).
3969; @param A1 Pointer to a uint16_t for the resulting FSW.
3970; @param A2 Pointer to the first 80-bit value.
3971; @param A3 Pointer to the second 80-bit value.
3972;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; loaded first, ends up in ST1
        fld     tword [A2]              ; loaded second, ends up in ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      st0, st1                ; compare ST0 against ST1

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3994
3995
3996;;
3997; FPU instruction working on two 80-bit floating point values,
3998; returning FSW and EFLAGS (eax).
3999;
4000; @param 1 The instruction
4001;
4002; @returns EFLAGS in EAX.
4003; @param A0 FPU context (fxsave).
4004; @param A1 Pointer to a uint16_t for the resulting FSW.
4005; @param A2 Pointer to the first 80-bit value.
4006; @param A3 Pointer to the second 80-bit value.
4007;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A3]              ; loaded first, ends up in ST1
        fld     tword [A2]              ; loaded second, ends up in ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1      st1                     ; compare ST0 against ST1, setting EFLAGS

        fnstsw  word [A1]               ; return the resulting status word
        pushf                           ; fetch the resulting EFLAGS ...
        pop     xAX                     ; ... and return them in xAX

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
4031
4032
4033;;
4034; FPU instruction working on one 80-bit floating point value.
4035;
4036; @param 1 The instruction
4037;
4038; @param A0 FPU context (fxsave).
4039; @param A1 Pointer to a IEMFPURESULT for the output.
4040; @param A2 Pointer to the 80-bit value.
4041;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1                              ; the unary instruction, operating on ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
4069
4070
4071;;
4072; FPU instruction working on one 80-bit floating point value, only
4073; returning FSW.
4074;
4075; @param 1 The instruction
4076; @param 2 Non-zero to also restore FTW.
4077;
4078; @param A0 FPU context (fxsave).
4079; @param A1 Pointer to a uint16_t for the resulting FSW.
4080; @param A2 Pointer to the 80-bit value.
4081;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0 ; also restores the FTW bit for ST0 (needed by fxam)
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
%endif
        %1                              ; the instruction, examining/testing ST0

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
4106
4107
4108
4109;;
4110; FPU instruction loading a 80-bit floating point constant.
4111;
4112; @param 1 The instruction
4113;
4114; @param A0 FPU context (fxsave).
4115; @param A1 Pointer to a IEMFPURESULT for the output.
4116;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1                              ; push the constant onto the FPU stack (becomes ST0)

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; return ST0

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
4143
4144
4145;;
4146; FPU instruction working on one 80-bit floating point value, outputing two.
4147;
4148; @param 1 The instruction
4149;
4150; @param A0 FPU context (fxsave).
4151; @param A1 Pointer to a IEMFPURESULTTWO for the output.
4152; @param A2 Pointer to the 80-bit value.
4153;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment save/restore

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits
        %1                              ; the instruction, producing results in ST0 and ST1

        fnstsw  word [A1 + IEMFPURESULTTWO.FSW] ; return the resulting status word
        fnclex                          ; clear pending exceptions before popping
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; pop ST0 into the second result slot
        fnclex                          ; clear again in case the first fstp raised anything
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; pop the next value into the first result slot

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
4179
4180
4181
4182
4183;---------------------- SSE and MMX Operations ----------------------
4184
4185;; @todo what do we need to do for MMX?
; Currently empty placeholders so per-ISA setup/teardown can be added in one
; place later without touching every helper body.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
4202
4203
4204;;
4205; Media instruction working on two full sized registers.
4206;
4207; @param 1 The instruction
4208; @param 2 Whether there is an MMX variant (1) or not (0).
4209;
4210; @param A0 FPU context (fxsave).
4211; @param A1 Pointer to the first media register size operand (input/output).
4212; @param A2 Pointer to the second media register size operand (input).
4213;
4214; @todo r=aeichner Currently unused, can probably be removed.
4215;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE
        ; Note: A0 (the FPU context) is not referenced by this helper body.

        movq    mm0, [A1]               ; mm0 = destination operand
        movq    mm1, [A2]               ; mm1 = source operand
        %1      mm0, mm1
        movq    [A1], mm0               ; write back the result

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        ; Note: A0 (the FPU context) is not referenced by this helper body.

        movdqu  xmm0, [A1]              ; unaligned load - no alignment guarantees on A1
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; write back the result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
4245
4246;;
4247; Media instruction working on two full sized registers, but no FXSAVE state argument.
4248;
4249; @param 1 The instruction
4250; @param 2 Whether there is an MMX variant (1) or not (0).
4251;
4252; @param A0 Pointer to the first media register size operand (input/output).
4253; @param A1 Pointer to the second media register size operand (input).
4254;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; mm0 = destination operand
        movq    mm1, [A1]               ; mm1 = source operand
        %1      mm0, mm1
        movq    [A0], mm0               ; write back the result

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; unaligned load - no alignment guarantees on A0
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write back the result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

; Second parameter: 1 = also instantiate the 64-bit MMX variant, 0 = SSE/128-bit only.
IEMIMPL_MEDIA_OPT_F2 pshufb, 1
IEMIMPL_MEDIA_OPT_F2 pand, 1
IEMIMPL_MEDIA_OPT_F2 pandn, 1
IEMIMPL_MEDIA_OPT_F2 por, 1
IEMIMPL_MEDIA_OPT_F2 pxor, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqb, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqw, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqd, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqq, 0
IEMIMPL_MEDIA_OPT_F2 pcmpgtb, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtw, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtd, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtq, 0
IEMIMPL_MEDIA_OPT_F2 paddb, 1
IEMIMPL_MEDIA_OPT_F2 paddw, 1
IEMIMPL_MEDIA_OPT_F2 paddd, 1
IEMIMPL_MEDIA_OPT_F2 paddq, 1
IEMIMPL_MEDIA_OPT_F2 paddsb, 1
IEMIMPL_MEDIA_OPT_F2 paddsw, 1
IEMIMPL_MEDIA_OPT_F2 paddusb, 1
IEMIMPL_MEDIA_OPT_F2 paddusw, 1
IEMIMPL_MEDIA_OPT_F2 psubb, 1
IEMIMPL_MEDIA_OPT_F2 psubw, 1
IEMIMPL_MEDIA_OPT_F2 psubd, 1
IEMIMPL_MEDIA_OPT_F2 psubq, 1
IEMIMPL_MEDIA_OPT_F2 psubsb, 1
IEMIMPL_MEDIA_OPT_F2 psubsw, 1
IEMIMPL_MEDIA_OPT_F2 psubusb, 1
IEMIMPL_MEDIA_OPT_F2 psubusw, 1
IEMIMPL_MEDIA_OPT_F2 pmullw, 1
IEMIMPL_MEDIA_OPT_F2 pmulld, 0
IEMIMPL_MEDIA_OPT_F2 pmulhw, 1
IEMIMPL_MEDIA_OPT_F2 pmaddwd, 1
IEMIMPL_MEDIA_OPT_F2 pminub, 1
IEMIMPL_MEDIA_OPT_F2 pminuw, 0
IEMIMPL_MEDIA_OPT_F2 pminud, 0
IEMIMPL_MEDIA_OPT_F2 pminsb, 0
IEMIMPL_MEDIA_OPT_F2 pminsw, 1
IEMIMPL_MEDIA_OPT_F2 pminsd, 0
IEMIMPL_MEDIA_OPT_F2 pmaxub, 1
IEMIMPL_MEDIA_OPT_F2 pmaxuw, 0
IEMIMPL_MEDIA_OPT_F2 pmaxud, 0
IEMIMPL_MEDIA_OPT_F2 pmaxsb, 0
IEMIMPL_MEDIA_OPT_F2 pmaxsw, 1
IEMIMPL_MEDIA_OPT_F2 pmaxsd, 0
IEMIMPL_MEDIA_OPT_F2 pabsb, 1
IEMIMPL_MEDIA_OPT_F2 pabsw, 1
IEMIMPL_MEDIA_OPT_F2 pabsd, 1
IEMIMPL_MEDIA_OPT_F2 psignb, 1
IEMIMPL_MEDIA_OPT_F2 psignw, 1
IEMIMPL_MEDIA_OPT_F2 psignd, 1
IEMIMPL_MEDIA_OPT_F2 phaddw, 1
IEMIMPL_MEDIA_OPT_F2 phaddd, 1
IEMIMPL_MEDIA_OPT_F2 phsubw, 1
IEMIMPL_MEDIA_OPT_F2 phsubd, 1
IEMIMPL_MEDIA_OPT_F2 phaddsw, 1
IEMIMPL_MEDIA_OPT_F2 phsubsw, 1
IEMIMPL_MEDIA_OPT_F2 pmaddubsw, 1
IEMIMPL_MEDIA_OPT_F2 pmulhrsw, 1
IEMIMPL_MEDIA_OPT_F2 pmuludq, 1
IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
4377
4378;;
4379; Media instruction working on one full sized and one half sized register (lower half).
4380;
4381; @param 1 The instruction
4382; @param 2 1 if MMX is included, 0 if not.
4383;
4384; @param A0 Pointer to the first full sized media register operand (input/output).
4385; @param A1 Pointer to the second half sized media register operand (input).
4386;
4387%macro IEMIMPL_MEDIA_F1L1 2
4388 %if %2 != 0
4389BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
4390 PROLOGUE_2_ARGS
4391 IEMIMPL_MMX_PROLOGUE
4392
4393 movq mm0, [A0]
4394 movq mm1, [A1]
4395 %1 mm0, mm1
4396 movq [A0], mm0
4397
4398 IEMIMPL_MMX_EPILOGUE
4399 EPILOGUE_2_ARGS
4400ENDPROC iemAImpl_ %+ %1 %+ _u64
4401 %endif
4402
4403BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
4404 PROLOGUE_2_ARGS
4405 IEMIMPL_SSE_PROLOGUE
4406
4407 movdqu xmm0, [A0]
4408 movdqu xmm1, [A1]
4409 %1 xmm0, xmm1
4410 movdqu [A0], xmm0
4411
4412 IEMIMPL_SSE_EPILOGUE
4413 EPILOGUE_2_ARGS
4414ENDPROC iemAImpl_ %+ %1 %+ _u128
4415%endmacro
4416
4417IEMIMPL_MEDIA_F1L1 punpcklbw, 1
4418IEMIMPL_MEDIA_F1L1 punpcklwd, 1
4419IEMIMPL_MEDIA_F1L1 punpckldq, 1
4420IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
4421
4422
4423;;
4424; Media instruction working two half sized input registers (lower half) and a full sized
4425; destination register (vpunpckh*).
4426;
4427; @param 1 The instruction
4428;
4429; @param A0 Pointer to the destination register (full sized, output only).
4430; @param A1 Pointer to the first full sized media source register operand, where we
4431; will only use the lower half as input - but we'll be loading it in full.
4432; @param A2 Pointer to the second full sized media source register operand, where we
4433; will only use the lower half as input - but we'll be loading it in full.
4434;
4435%macro IEMIMPL_MEDIA_F1L1L1 1
4436BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4437 PROLOGUE_3_ARGS
4438 IEMIMPL_AVX_PROLOGUE
4439
4440 vmovdqu xmm0, [A1]
4441 vmovdqu xmm1, [A2]
4442 %1 xmm0, xmm0, xmm1
4443 vmovdqu [A0], xmm0
4444
4445 IEMIMPL_AVX_PROLOGUE
4446 EPILOGUE_3_ARGS
4447ENDPROC iemAImpl_ %+ %1 %+ _u128
4448
4449BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4450 PROLOGUE_3_ARGS
4451 IEMIMPL_AVX_PROLOGUE
4452
4453 vmovdqu ymm0, [A1]
4454 vmovdqu ymm1, [A2]
4455 %1 ymm0, ymm0, ymm1
4456 vmovdqu [A0], ymm0
4457
4458 IEMIMPL_AVX_PROLOGUE
4459 EPILOGUE_3_ARGS
4460ENDPROC iemAImpl_ %+ %1 %+ _u256
4461%endmacro
4462
4463IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
4464IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
4465IEMIMPL_MEDIA_F1L1L1 vpunpckldq
4466IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
4467
4468
4469;;
4470; Media instruction working on one full sized and one half sized register (high half).
4471;
4472; @param 1 The instruction
4473; @param 2 1 if MMX is included, 0 if not.
4474;
4475; @param A0 Pointer to the first full sized media register operand (input/output).
4476; @param A1 Pointer to the second full sized media register operand, where we
4477; will only use the upper half as input - but we'll load it in full.
4478;
4479%macro IEMIMPL_MEDIA_F1H1 2
4480IEMIMPL_MEDIA_F1L1 %1, %2
4481%endmacro
4482
4483IEMIMPL_MEDIA_F1L1 punpckhbw, 1
4484IEMIMPL_MEDIA_F1L1 punpckhwd, 1
4485IEMIMPL_MEDIA_F1L1 punpckhdq, 1
4486IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
4487
4488
4489;;
4490; Media instruction working two half sized input registers (high half) and a full sized
4491; destination register (vpunpckh*).
4492;
4493; @param 1 The instruction
4494;
4495; @param A0 Pointer to the destination register (full sized, output only).
4496; @param A1 Pointer to the first full sized media source register operand, where we
4497; will only use the upper half as input - but we'll be loading it in full.
4498; @param A2 Pointer to the second full sized media source register operand, where we
4499; will only use the upper half as input - but we'll be loading it in full.
4500;
4501%macro IEMIMPL_MEDIA_F1H1H1 1
4502IEMIMPL_MEDIA_F1L1L1 %1
4503%endmacro
4504
4505IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
4506IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
4507IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
4508IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4509
4510
4511;
4512; Shufflers with evil 8-bit immediates.
4513;
4514
4515BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
4516 PROLOGUE_3_ARGS
4517 IEMIMPL_MMX_PROLOGUE
4518
4519 movzx A2, A2_8 ; must clear top bits
4520 movq mm1, [A1]
4521 movq mm0, mm0 ; paranoia!
4522 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 5
4523 movq [A0], mm0
4524
4525 IEMIMPL_MMX_EPILOGUE
4526 EPILOGUE_3_ARGS
4527%assign bImm 0
4528%rep 256
4529.imm %+ bImm:
4530 IBT_ENDBRxx_WITHOUT_NOTRACK
4531 pshufw mm0, mm1, bImm
4532 ret
4533 %assign bImm bImm + 1
4534%endrep
4535.immEnd:
4536ENDPROC iemAImpl_pshufw_u64
4537
4538
;;
; SSE pshuf{hw,lw,d} with an 8-bit immediate, implemented via a 256-entry
; jump table (same scheme as iemAImpl_pshufw_u64 above).
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination operand (output).
; @param A1 Pointer to the source operand (input).
; @param A2 The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia! (pshufhw/pshuflw only modify half of xmm0)
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

    %assign bImm 0
    %rep 256
.imm %+ bImm:                           ; one stub per possible immediate value
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
    %assign bImm bImm + 1
    %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4568
4569
;;
; AVX vpshuf{hw,lw,d} on the full 256-bit register with an 8-bit immediate,
; implemented via a 256-entry jump table.
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination operand (output).
; @param A1 Pointer to the source operand (input).
; @param A2 The 8-bit immediate (only the low byte is used).
;
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE/EPILOGUE rather than the AVX
; variants even though this is AVX code - confirm this is intentional.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        vmovdqu [A0], ymm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
    %assign bImm 0
    %rep 256
.imm %+ bImm:                           ; one stub per possible immediate value
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
    %assign bImm bImm + 1
    %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4598
4599
4600;
4601; Shifts with evil 8-bit immediates.
4602;
4603
4604%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
4605BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
4606 PROLOGUE_2_ARGS
4607 IEMIMPL_MMX_PROLOGUE
4608
4609 movzx A1, A1_8 ; must clear top bits
4610 movq mm0, [A0]
4611 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 5
4612 movq [A0], mm0
4613
4614 IEMIMPL_MMX_EPILOGUE
4615 EPILOGUE_2_ARGS
4616%assign bImm 0
4617%rep 256
4618.imm %+ bImm:
4619 IBT_ENDBRxx_WITHOUT_NOTRACK
4620 %1 mm0, bImm
4621 ret
4622 %assign bImm bImm + 1
4623%endrep
4624.immEnd:
4625ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
4626%endmacro
4627
4628IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4629IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4630IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4631IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4632IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4633IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4634IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4635IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4636
4637
;;
; SSE shift-by-immediate, implemented via a 256-entry jump table.
;
; @param 1 The instruction
;
; @param A0 Pointer to the operand to shift (input/output).
; @param A1 The 8-bit immediate shift count (only the low byte is used).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movdqu  xmm0, [A0]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
    %assign bImm 0
    %rep 256
.imm %+ bImm:                           ; one stub per possible shift count
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
    %assign bImm bImm + 1
    %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4672
4673
4674;
4675; Move byte mask.
4676;
4677
4678BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
4679 PROLOGUE_2_ARGS
4680 IEMIMPL_MMX_PROLOGUE
4681
4682 movq mm1, [A1]
4683 pmovmskb T0, mm1
4684 mov [A0], T0
4685%ifdef RT_ARCH_X86
4686 mov dword [A0 + 4], 0
4687%endif
4688 IEMIMPL_MMX_EPILOGUE
4689 EPILOGUE_2_ARGS
4690ENDPROC iemAImpl_pmovmskb_u64
4691
;;
; pmovmskb (SSE): gathers the most significant bit of each source byte into a
; GPR-sized bitmask.
;
; @param A0 Pointer to the destination (output; stored zero extended to 64 bits).
; @param A1 Pointer to the source media register (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32 bits on x86; zero the upper half explicitly
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4705
;;
; vpmovmskb (AVX2): gathers the most significant bit of each source byte into
; a GPR-sized bitmask.
;
; @param A0 Pointer to the destination (output; stored zero extended to 64 bits).
; @param A1 Pointer to the source media register (input).
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32 bits on x86; zero the upper half explicitly
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4719
4720
4721;;
4722; Media instruction working on two full sized source registers and one destination (AVX).
4723;
4724; @param 1 The instruction
4725;
4726; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4727; @param A1 Pointer to the destination media register size operand (output).
4728; @param A2 Pointer to the first source media register size operand (input).
4729; @param A3 Pointer to the second source media register size operand (input).
4730;
4731; @todo r=aeichner Not used right now
4732;
4733%macro IEMIMPL_MEDIA_F3 1
4734BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4735 PROLOGUE_4_ARGS
4736 IEMIMPL_AVX_PROLOGUE
4737
4738 vmovdqu xmm0, [A2]
4739 vmovdqu xmm1, [A3]
4740 %1 xmm0, xmm0, xmm1
4741 vmovdqu [A1], xmm0
4742
4743 IEMIMPL_AVX_PROLOGUE
4744 EPILOGUE_4_ARGS
4745ENDPROC iemAImpl_ %+ %1 %+ _u128
4746
4747BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4748 PROLOGUE_4_ARGS
4749 IEMIMPL_AVX_PROLOGUE
4750
4751 vmovdqu ymm0, [A2]
4752 vmovdqu ymm1, [A3]
4753 %1 ymm0, ymm0, ymm1
4754 vmovdqu [A1], ymm0
4755
4756 IEMIMPL_AVX_PROLOGUE
4757 EPILOGUE_4_ARGS
4758ENDPROC iemAImpl_ %+ %1 %+ _u256
4759%endmacro
4760
4761;;
4762; Media instruction working on two full sized source registers and one destination (AVX),
4763; but no XSAVE state pointer argument.
4764;
4765; @param 1 The instruction
4766;
4767; @param A0 Pointer to the destination media register size operand (output).
4768; @param A1 Pointer to the first source media register size operand (input).
4769; @param A2 Pointer to the second source media register size operand (input).
4770;
4771%macro IEMIMPL_MEDIA_OPT_F3 1
4772BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4773 PROLOGUE_3_ARGS
4774 IEMIMPL_AVX_PROLOGUE
4775
4776 vmovdqu xmm0, [A1]
4777 vmovdqu xmm1, [A2]
4778 %1 xmm0, xmm0, xmm1
4779 vmovdqu [A0], xmm0
4780
4781 IEMIMPL_AVX_PROLOGUE
4782 EPILOGUE_3_ARGS
4783ENDPROC iemAImpl_ %+ %1 %+ _u128
4784
4785BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4786 PROLOGUE_3_ARGS
4787 IEMIMPL_AVX_PROLOGUE
4788
4789 vmovdqu ymm0, [A1]
4790 vmovdqu ymm1, [A2]
4791 %1 ymm0, ymm0, ymm1
4792 vmovdqu [A0], ymm0
4793
4794 IEMIMPL_AVX_PROLOGUE
4795 EPILOGUE_3_ARGS
4796ENDPROC iemAImpl_ %+ %1 %+ _u256
4797%endmacro
4798
; Instantiations of the three-operand AVX template (IEMIMPL_MEDIA_OPT_F3):
; generates a _u128 and a _u256 helper for each instruction.
IEMIMPL_MEDIA_OPT_F3 vpshufb
IEMIMPL_MEDIA_OPT_F3 vpand
IEMIMPL_MEDIA_OPT_F3 vpminub
IEMIMPL_MEDIA_OPT_F3 vpminuw
IEMIMPL_MEDIA_OPT_F3 vpminud
IEMIMPL_MEDIA_OPT_F3 vpminsb
IEMIMPL_MEDIA_OPT_F3 vpminsw
IEMIMPL_MEDIA_OPT_F3 vpminsd
IEMIMPL_MEDIA_OPT_F3 vpmaxub
IEMIMPL_MEDIA_OPT_F3 vpmaxuw
IEMIMPL_MEDIA_OPT_F3 vpmaxud
IEMIMPL_MEDIA_OPT_F3 vpmaxsb
IEMIMPL_MEDIA_OPT_F3 vpmaxsw
IEMIMPL_MEDIA_OPT_F3 vpmaxsd
IEMIMPL_MEDIA_OPT_F3 vpandn
IEMIMPL_MEDIA_OPT_F3 vpor
IEMIMPL_MEDIA_OPT_F3 vpxor
IEMIMPL_MEDIA_OPT_F3 vpcmpeqb
IEMIMPL_MEDIA_OPT_F3 vpcmpeqw
IEMIMPL_MEDIA_OPT_F3 vpcmpeqd
IEMIMPL_MEDIA_OPT_F3 vpcmpeqq
IEMIMPL_MEDIA_OPT_F3 vpcmpgtb
IEMIMPL_MEDIA_OPT_F3 vpcmpgtw
IEMIMPL_MEDIA_OPT_F3 vpcmpgtd
IEMIMPL_MEDIA_OPT_F3 vpcmpgtq
IEMIMPL_MEDIA_OPT_F3 vpaddb
IEMIMPL_MEDIA_OPT_F3 vpaddw
IEMIMPL_MEDIA_OPT_F3 vpaddd
IEMIMPL_MEDIA_OPT_F3 vpaddq
IEMIMPL_MEDIA_OPT_F3 vpsubb
IEMIMPL_MEDIA_OPT_F3 vpsubw
IEMIMPL_MEDIA_OPT_F3 vpsubd
IEMIMPL_MEDIA_OPT_F3 vpsubq
IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
IEMIMPL_MEDIA_OPT_F3 vpsubsb
IEMIMPL_MEDIA_OPT_F3 vpsubsw
IEMIMPL_MEDIA_OPT_F3 vpsubusb
IEMIMPL_MEDIA_OPT_F3 vpsubusw
IEMIMPL_MEDIA_OPT_F3 vpaddusb
IEMIMPL_MEDIA_OPT_F3 vpaddusw
IEMIMPL_MEDIA_OPT_F3 vpaddsb
IEMIMPL_MEDIA_OPT_F3 vpaddsw
IEMIMPL_MEDIA_OPT_F3 vpermilps
IEMIMPL_MEDIA_OPT_F3 vpermilpd
IEMIMPL_MEDIA_OPT_F3 vpmaddwd
IEMIMPL_MEDIA_OPT_F3 vpsrlvd
IEMIMPL_MEDIA_OPT_F3 vpsrlvq
IEMIMPL_MEDIA_OPT_F3 vpsravd
IEMIMPL_MEDIA_OPT_F3 vpsllvd
IEMIMPL_MEDIA_OPT_F3 vpsllvq
4876
4877;;
4878; Media instruction working on one full sized source register, one full sized destination
4879; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
4880; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
4881; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
4882; of either 16, 32, or 64, it acts like the max shift size)
4883;
4884; @param 1 The instruction
4885;
4886; @param A0 Pointer to the destination media register size operand (output).
4887; @param A1 Pointer to the first source media register size operand (input).
4888; @param A2 Pointer to the second source media register size operand (input).
4889;
4890%macro IEMIMPL_SHIFT_OPT_F3 1
4891BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4892 PROLOGUE_3_ARGS
4893 IEMIMPL_AVX_PROLOGUE
4894
4895 vmovdqu xmm0, [A1]
4896 vmovdqu xmm1, [A2]
4897 %1 xmm0, xmm0, xmm1
4898 vmovdqu [A0], xmm0
4899
4900 IEMIMPL_AVX_PROLOGUE
4901 EPILOGUE_3_ARGS
4902ENDPROC iemAImpl_ %+ %1 %+ _u128
4903
4904BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4905 PROLOGUE_3_ARGS
4906 IEMIMPL_AVX_PROLOGUE
4907
4908 vmovdqu ymm0, [A1]
4909 vmovdqu xmm1, [A2]
4910 %1 ymm0, ymm0, xmm1
4911 vmovdqu [A0], ymm0
4912
4913 IEMIMPL_AVX_PROLOGUE
4914 EPILOGUE_3_ARGS
4915ENDPROC iemAImpl_ %+ %1 %+ _u256
4916%endmacro
4917
; Instantiations of the variable-shift template (IEMIMPL_SHIFT_OPT_F3).
IEMIMPL_SHIFT_OPT_F3 vpsllw
IEMIMPL_SHIFT_OPT_F3 vpslld
IEMIMPL_SHIFT_OPT_F3 vpsllq
IEMIMPL_SHIFT_OPT_F3 vpsraw
IEMIMPL_SHIFT_OPT_F3 vpsrad
IEMIMPL_SHIFT_OPT_F3 vpsrlw
IEMIMPL_SHIFT_OPT_F3 vpsrld
IEMIMPL_SHIFT_OPT_F3 vpsrlq
4926
4927
4928;;
4929; Media instruction working on one full sized source registers and one destination (AVX),
4930; but no XSAVE state pointer argument.
4931;
4932; @param 1 The instruction
4933; @param 2 Flag whether the isntruction has a 256-bit (AVX2) variant (1) or not (0).
4934;
4935; @param A0 Pointer to the destination media register size operand (output).
4936; @param A1 Pointer to the source media register size operand (input).
4937;
4938%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
4939BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4940 PROLOGUE_2_ARGS
4941 IEMIMPL_AVX_PROLOGUE
4942
4943 vmovdqu xmm0, [A1]
4944 %1 xmm0, xmm0
4945 vmovdqu [A0], xmm0
4946
4947 IEMIMPL_AVX_PROLOGUE
4948 EPILOGUE_2_ARGS
4949ENDPROC iemAImpl_ %+ %1 %+ _u128
4950
4951 %if %2 == 1
4952BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4953 PROLOGUE_2_ARGS
4954 IEMIMPL_AVX_PROLOGUE
4955
4956 vmovdqu ymm0, [A1]
4957 %1 ymm0, ymm0
4958 vmovdqu [A0], ymm0
4959
4960 IEMIMPL_AVX_PROLOGUE
4961 EPILOGUE_2_ARGS
4962ENDPROC iemAImpl_ %+ %1 %+ _u256
4963 %endif
4964%endmacro
4965
4966IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4967IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4968IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4969IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4970
4971
4972;
4973; The SSE 4.2 crc32
4974;
; @param A0 Pointer to the 32-bit destination (CRC accumulator, input/output).
; @param A1 The source operand, sized according to the suffix.
4977;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC accumulator
        crc32   T0_32, A1_8             ; fold in the 8-bit source operand
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4987
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC accumulator
        crc32   T0_32, A1_16            ; fold in the 16-bit source operand
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4997
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC accumulator
        crc32   T0_32, A1_32            ; fold in the 32-bit source operand
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
5007
%ifdef RT_ARCH_AMD64                    ; 64-bit source operand only exists on AMD64 hosts
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC accumulator (zero extends into T0)
        crc32   T0, A1                  ; 64-bit operand form required for a 64-bit source
        mov     [A0], T0_32             ; the CRC result is 32 bits; store the low dword

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
5019
5020
5021;
5022; PTEST (SSE 4.1)
5023;
5024; @param A0 Pointer to the first source operand (aka readonly destination).
5025; @param A1 Pointer to the second source operand.
5026; @param A2 Pointer to the EFLAGS register.
5027;
5028BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
5029 PROLOGUE_3_ARGS
5030 IEMIMPL_SSE_PROLOGUE
5031
5032 movdqu xmm0, [A0]
5033 movdqu xmm1, [A1]
5034 ptest xmm0, xmm1
5035 IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF
5036
5037 IEMIMPL_SSE_EPILOGUE
5038 EPILOGUE_3_ARGS
5039ENDPROC iemAImpl_ptest_u128
5040
;;
; VPTEST (AVX), 256-bit variant.  Same contract as iemAImpl_ptest_u128.
;
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE/EPILOGUE although this is AVX code -
; confirm this is intentional.
;
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1              ; only updates flags, registers are untouched
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
5053
5054
5055;;
5056; Template for the [v]pmov{s,z}x* instructions
5057;
5058; @param 1 The instruction
5059;
5060; @param A0 Pointer to the destination media register size operand (output).
5061; @param A1 The source operand value (input).
5062;
5063%macro IEMIMPL_V_PMOV_SZ_X 1
5064BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
5065 PROLOGUE_2_ARGS
5066 IEMIMPL_SSE_PROLOGUE
5067
5068 movd xmm0, A1
5069 %1 xmm0, xmm0
5070 vmovdqu [A0], xmm0
5071
5072 IEMIMPL_SSE_PROLOGUE
5073 EPILOGUE_2_ARGS
5074ENDPROC iemAImpl_ %+ %1 %+ _u128
5075
5076BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
5077 PROLOGUE_2_ARGS
5078 IEMIMPL_AVX_PROLOGUE
5079
5080 movd xmm0, A1
5081 v %+ %1 xmm0, xmm0
5082 vmovdqu [A0], xmm0
5083
5084 IEMIMPL_AVX_PROLOGUE
5085 EPILOGUE_2_ARGS
5086ENDPROC iemAImpl_v %+ %1 %+ _u128
5087
5088BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
5089 PROLOGUE_2_ARGS
5090 IEMIMPL_AVX_PROLOGUE
5091
5092 movdqu xmm0, [A1]
5093 v %+ %1 ymm0, xmm0
5094 vmovdqu [A0], ymm0
5095
5096 IEMIMPL_AVX_PROLOGUE
5097 EPILOGUE_2_ARGS
5098ENDPROC iemAImpl_v %+ %1 %+ _u256
5099%endmacro
5100
; Sign-extending moves.
IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

; Zero-extending moves.
IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
5114
5115
5116;;
5117; Initialize the SSE MXCSR register using the guest value partially to
5118; account for rounding mode, load the value from the given register.
5119;
5120; @uses 4 bytes of stack to save the original value, T0.
5121; @param 1 Expression giving the register holding the guest's MXCSR.
5122;
5123%macro SSE_AVX_LD_MXCSR 1
5124 sub xSP, 4
5125
5126 stmxcsr [xSP]
5127 mov T0_32, %1
5128 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
5129 or T0_32, X86_MXCSR_XCPT_MASK
5130 sub xSP, 4
5131 mov [xSP], T0_32
5132 ldmxcsr [xSP]
5133 add xSP, 4
5134%endmacro
5135
5136
5137;;
5138; Restores the SSE MXCSR register with the original value.
5139;
5140; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
5141; @param 1 Expression giving the register to return the new guest's MXCSR value.
5142; @param 2 Expression giving the register holding original guest's MXCSR value.
5143;
5144; @note Restores the stack pointer.
5145;
5146%macro SSE_AVX_ST_MXCSR 2
5147 sub xSP, 4
5148 stmxcsr [xSP]
5149 mov %1, [xSP]
5150 add xSP, 4
5151 ; Merge the status bits into the original MXCSR value.
5152 and %1, X86_MXCSR_XCPT_FLAGS
5153 or %1, %2
5154
5155 ldmxcsr [xSP]
5156 add xSP, 4
5157%endmacro
5158
5159
5160;;
5161; Floating point instruction working on two full sized registers.
5162;
5163; @param 1 The instruction
5164; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
5165;
5166; @returns R0_32 The new MXCSR value of the guest.
5167; @param A0 The guest's MXCSR register value to use.
5168; @param A1 Where to return the result.
5169; @param A2 Pointer to the first media register size operand (input/output).
5170; @param A3 Pointer to the second media register size operand (input).
5171;
5172%macro IEMIMPL_FP_F2 2
5173BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
5174 PROLOGUE_4_ARGS
5175 IEMIMPL_SSE_PROLOGUE
5176 SSE_AVX_LD_MXCSR A0_32
5177
5178 movdqu xmm0, [A2]
5179 movdqu xmm1, [A3]
5180 %1 xmm0, xmm1
5181 movdqu [A1], xmm0
5182
5183 SSE_AVX_ST_MXCSR R0_32, A0_32
5184 IEMIMPL_SSE_PROLOGUE
5185 EPILOGUE_4_ARGS
5186ENDPROC iemAImpl_ %+ %1 %+ _u128
5187
5188 %if %2 == 3
5189BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
5190 PROLOGUE_4_ARGS
5191 IEMIMPL_AVX_PROLOGUE
5192 SSE_AVX_LD_MXCSR A0_32
5193
5194 vmovdqu xmm0, [A2]
5195 vmovdqu xmm1, [A3]
5196 v %+ %1 xmm0, xmm0, xmm1
5197 vmovdqu [A1], xmm0
5198
5199 SSE_AVX_ST_MXCSR R0_32, A0_32
5200 IEMIMPL_AVX_PROLOGUE
5201 EPILOGUE_4_ARGS
5202ENDPROC iemAImpl_v %+ %1 %+ _u128
5203
5204BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
5205 PROLOGUE_4_ARGS
5206 IEMIMPL_AVX_PROLOGUE
5207 SSE_AVX_LD_MXCSR A0_32
5208
5209 vmovdqu ymm0, [A2]
5210 vmovdqu ymm1, [A3]
5211 v %+ %1 ymm0, ymm0, ymm1
5212 vmovdqu [A1], ymm0
5213
5214 SSE_AVX_ST_MXCSR R0_32, A0_32
5215 IEMIMPL_AVX_PROLOGUE
5216 EPILOGUE_4_ARGS
5217ENDPROC iemAImpl_v %+ %1 %+ _u256
5218 %elif %2 == 2
5219BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
5220 PROLOGUE_4_ARGS
5221 IEMIMPL_AVX_PROLOGUE
5222 SSE_AVX_LD_MXCSR A0_32
5223
5224 vmovdqu xmm0, [A2]
5225 vmovdqu xmm1, [A3]
5226 v %+ %1 xmm0, xmm1
5227 vmovdqu [A1], xmm0
5228
5229 SSE_AVX_ST_MXCSR R0_32, A0_32
5230 IEMIMPL_AVX_PROLOGUE
5231 EPILOGUE_4_ARGS
5232ENDPROC iemAImpl_v %+ %1 %+ _u128
5233
5234BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
5235 PROLOGUE_4_ARGS
5236 IEMIMPL_AVX_PROLOGUE
5237 SSE_AVX_LD_MXCSR A0_32
5238
5239 vmovdqu ymm0, [A2]
5240 vmovdqu ymm1, [A3]
5241 v %+ %1 ymm0, ymm1
5242 vmovdqu [A1], ymm0
5243
5244 SSE_AVX_ST_MXCSR R0_32, A0_32
5245 IEMIMPL_AVX_PROLOGUE
5246 EPILOGUE_4_ARGS
5247ENDPROC iemAImpl_v %+ %1 %+ _u256
5248 %endif
5249%endmacro
5250
; Binary FP ops: AVX variants take three operands.
IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3
5269
5270
5271;;
5272; These are actually unary operations but to keep it simple
5273; we treat them as binary for now, so the output result is
5274; always in sync with the register where the result might get written
5275; to.
5276IEMIMPL_FP_F2 sqrtps, 2
5277IEMIMPL_FP_F2 rsqrtps, 2
5278IEMIMPL_FP_F2 sqrtpd, 2
5279IEMIMPL_FP_F2 rcpps, 2
5280IEMIMPL_FP_F2 cvtdq2ps, 2
5281IEMIMPL_FP_F2 cvtps2dq, 2
5282IEMIMPL_FP_F2 cvttps2dq, 2
5283IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
5284IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
5285IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
5286
5287
5288;;
5289; Floating point instruction working on a full sized register and a single precision operand.
5290;
5291; @param 1 The instruction
5292;
5293; @return R0_32 The new MXCSR value of the guest.
5294; @param A0 The guest's MXCSR register value to use.
5295; @param A1 Where to return the result.
5296; @param A2 Pointer to the first media register size operand (input/output).
5297; @param A3 Pointer to the second single precision floating point value (input).
5298;
5299%macro IEMIMPL_FP_F2_R32 1
5300BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
5301 PROLOGUE_4_ARGS
5302 IEMIMPL_SSE_PROLOGUE
5303 SSE_AVX_LD_MXCSR A0_32
5304
5305 movdqu xmm0, [A2]
5306 movd xmm1, [A3]
5307 %1 xmm0, xmm1
5308 movdqu [A1], xmm0
5309
5310 SSE_AVX_ST_MXCSR R0_32, A0_32
5311 IEMIMPL_SSE_EPILOGUE
5312 EPILOGUE_4_ARGS
5313ENDPROC iemAImpl_ %+ %1 %+ _u128_r32
5314
5315BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
5316 PROLOGUE_4_ARGS
5317 IEMIMPL_AVX_PROLOGUE
5318 SSE_AVX_LD_MXCSR A0_32
5319
5320 vmovdqu xmm0, [A2]
5321 vmovd xmm1, [A3]
5322 v %+ %1 xmm0, xmm0, xmm1
5323 vmovdqu [A1], xmm0
5324
5325 SSE_AVX_ST_MXCSR R0_32, A0_32
5326 IEMIMPL_AVX_PROLOGUE
5327 EPILOGUE_4_ARGS
5328ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
5329%endmacro
5330
; Scalar single-precision ops.
IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
5341
5342
5343;;
5344; Floating point instruction working on a full sized register and a double precision operand.
5345;
5346; @param 1 The instruction
5347;
5348; @return R0_32 The new MXCSR value of the guest.
5349; @param A0 The guest's MXCSR register value to use.
5350; @param A1 Where to return the result.
5351; @param A2 Pointer to the first media register size operand (input/output).
5352; @param A3 Pointer to the second double precision floating point value (input).
5353;
5354%macro IEMIMPL_FP_F2_R64 1
5355BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
5356 PROLOGUE_4_ARGS
5357 IEMIMPL_SSE_PROLOGUE
5358 SSE_AVX_LD_MXCSR A0_32
5359
5360 movdqu xmm0, [A2]
5361 movq xmm1, [A3]
5362 %1 xmm0, xmm1
5363 movdqu [A1], xmm0
5364
5365 SSE_AVX_ST_MXCSR R0_32, A0_32
5366 IEMIMPL_SSE_EPILOGUE
5367 EPILOGUE_4_ARGS
5368ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
5369
5370BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
5371 PROLOGUE_4_ARGS
5372 IEMIMPL_AVX_PROLOGUE
5373 SSE_AVX_LD_MXCSR A0_32
5374
5375 vmovdqu xmm0, [A2]
5376 vmovq xmm1, [A3]
5377 v %+ %1 xmm0, xmm0, xmm1
5378 vmovdqu [A1], xmm0
5379
5380 SSE_AVX_ST_MXCSR R0_32, A0_32
5381 IEMIMPL_AVX_EPILOGUE
5382 EPILOGUE_4_ARGS
5383ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
5384%endmacro
5385
; Scalar double-precision ops.
IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
5394
5395
5396;;
5397; Macro for the cvtpd2ps/cvtps2pd instructions.
5398;
5399; 1 The instruction name.
5400; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
5401;
5402; @return R0_32 The new MXCSR value of the guest.
5403; @param A0_32 The guest's MXCSR register value to use.
5404; @param A1 Where to return the result.
5405; @param A2 Pointer to the first media register size operand (input/output).
5406; @param A3 Pointer to the second media register size operand (input).
5407;
5408%macro IEMIMPL_CVT_F2 2
5409BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5410 PROLOGUE_4_ARGS
5411 IEMIMPL_SSE_PROLOGUE
5412 SSE_AVX_LD_MXCSR A0_32
5413
5414 movdqu xmm0, [A2]
5415 movdqu xmm1, [A3]
5416 %1 xmm0, xmm1
5417 movdqu [A1], xmm0
5418
5419 SSE_AVX_ST_MXCSR R0_32, A0_32
5420 IEMIMPL_SSE_EPILOGUE
5421 EPILOGUE_4_ARGS
5422ENDPROC iemAImpl_ %+ %1 %+ _u128
5423
5424BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
5425 PROLOGUE_4_ARGS
5426 IEMIMPL_AVX_PROLOGUE
5427 SSE_AVX_LD_MXCSR A0_32
5428
5429 vmovdqu xmm0, [A2]
5430 vmovdqu xmm1, [A3]
5431 v %+ %1 xmm0, xmm1
5432 vmovdqu [A1], xmm0
5433
5434 SSE_AVX_ST_MXCSR R0_32, A0_32
5435 IEMIMPL_AVX_EPILOGUE
5436 EPILOGUE_4_ARGS
5437ENDPROC iemAImpl_v %+ %1 %+ _u128
5438
5439BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
5440 PROLOGUE_4_ARGS
5441 IEMIMPL_AVX_PROLOGUE
5442 SSE_AVX_LD_MXCSR A0_32
5443
5444 vmovdqu ymm0, [A2]
5445 vmovdqu ymm1, [A3]
5446 %if %2 == 0
5447 v %+ %1 xmm0, ymm1
5448 %else
5449 v %+ %1 ymm0, xmm1
5450 %endif
5451 vmovdqu [A1], ymm0
5452
5453 SSE_AVX_ST_MXCSR R0_32, A0_32
5454 IEMIMPL_AVX_EPILOGUE
5455 EPILOGUE_4_ARGS
5456ENDPROC iemAImpl_v %+ %1 %+ _u256
5457%endmacro
5458
IEMIMPL_CVT_F2 cvtpd2ps, 0              ; narrowing conversion
IEMIMPL_CVT_F2 cvtps2pd, 1              ; widening conversion
5461
5462
5463;;
5464; shufps instructions with 8-bit immediates.
5465;
5466; @param A0 Pointer to the destination media register size operand (input/output).
5467; @param A1 Pointer to the first source media register size operand (input).
5468; @param A2 The 8-bit immediate
5469;
5470BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
5471 PROLOGUE_3_ARGS
5472 IEMIMPL_SSE_PROLOGUE
5473
5474 movzx A2, A2_8 ; must clear top bits
5475 movdqu xmm0, [A0]
5476 movdqu xmm1, [A1]
5477 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5478 movdqu [A0], xmm0
5479
5480 IEMIMPL_SSE_EPILOGUE
5481 EPILOGUE_3_ARGS
5482 %assign bImm 0
5483 %rep 256
5484.imm %+ bImm:
5485 IBT_ENDBRxx_WITHOUT_NOTRACK
5486 shufps xmm0, xmm1, bImm
5487 ret
5488 int3
5489 %assign bImm bImm + 1
5490 %endrep
5491.immEnd:
5492ENDPROC iemAImpl_shufps_u128
5493
5494
5495;;
5496; shufpd instruction with 8-bit immediates.
5497;
5498; @param A0 Pointer to the destination media register size operand (input/output).
5499; @param A1 Pointer to the first source media register size operand (input).
5500; @param A2 The 8-bit immediate
5501;
5502BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
5503 PROLOGUE_3_ARGS
5504 IEMIMPL_SSE_PROLOGUE
5505
5506 movzx A2, A2_8 ; must clear top bits
5507 movdqu xmm0, [A0]
5508 movdqu xmm1, [A1]
5509 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5510 movdqu [A0], xmm0
5511
5512 IEMIMPL_SSE_EPILOGUE
5513 EPILOGUE_3_ARGS
5514 %assign bImm 0
5515 %rep 256
5516.imm %+ bImm:
5517 IBT_ENDBRxx_WITHOUT_NOTRACK
5518 shufpd xmm0, xmm1, bImm
5519 ret
5520 %assign bImm bImm + 1
5521 %endrep
5522.immEnd:
5523ENDPROC iemAImpl_shufpd_u128
5524
5525
5526;;
5527; vshufp{s,d} instructions with 8-bit immediates.
5528;
5529; @param 1 The instruction name.
5530;
5531; @param A0 Pointer to the destination media register size operand (output).
5532; @param A1 Pointer to the first source media register size operand (input).
5533; @param A2 Pointer to the second source media register size operand (input).
5534; @param A3 The 8-bit immediate
5535;
5536%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
5537BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5538 PROLOGUE_4_ARGS
5539 IEMIMPL_AVX_PROLOGUE
5540
5541 movzx A3, A3_8 ; must clear top bits
5542 movdqu xmm0, [A1]
5543 movdqu xmm1, [A2]
5544 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6 ; dispatch to the .imm<A3> stub below (6-byte stride)
5545 movdqu [A0], xmm0
5546
5547 IEMIMPL_AVX_EPILOGUE
5548 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs; xmm0 doubles as first source and destination.
5549 %assign bImm 0
5550 %rep 256
5551.imm %+ bImm:
5552 IBT_ENDBRxx_WITHOUT_NOTRACK
5553 %1 xmm0, xmm0, xmm1, bImm
5554 ret
5555 %assign bImm bImm + 1
5556 %endrep
5557.immEnd:
5558ENDPROC iemAImpl_ %+ %1 %+ _u128
5559
5560BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5561 PROLOGUE_4_ARGS
5562 IEMIMPL_AVX_PROLOGUE
5563
5564 movzx A3, A3_8 ; must clear top bits
5565 vmovdqu ymm0, [A1]
5566 vmovdqu ymm1, [A2]
5567 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6 ; dispatch to the .imm<A3> stub below (6-byte stride)
5568 vmovdqu [A0], ymm0
5569
5570 IEMIMPL_AVX_EPILOGUE
5571 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs for the 256-bit form.
5572 %assign bImm 0
5573 %rep 256
5574.imm %+ bImm:
5575 IBT_ENDBRxx_WITHOUT_NOTRACK
5576 %1 ymm0, ymm0, ymm1, bImm
5577 ret
5578 %assign bImm bImm + 1
5579 %endrep
5580.immEnd:
5581ENDPROC iemAImpl_ %+ %1 %+ _u256
5582%endmacro
5583
5584IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
5585IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5586
5587
5588;;
5589; One of the [p]blendv{b,ps,pd} variants
5590;
5591; @param 1 The instruction
5592;
5593; @param A0 Pointer to the first media register sized operand (input/output).
5594; @param A1 Pointer to the second media sized value (input).
5595; @param A2 Pointer to the media register sized mask value (input).
5596;
5597%macro IEMIMPL_P_BLEND 1
5598BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5599 PROLOGUE_3_ARGS
5600 IEMIMPL_SSE_PROLOGUE
5601
5602 movdqu xmm0, [A2] ; This is implicit
5603 movdqu xmm1, [A0]
5604 movdqu xmm2, [A1] ; @todo Do I need to save the original value here first?
5605 %1 xmm1, xmm2 ; blend xmm1 from xmm2 under the implicit xmm0 mask
5606 movdqu [A0], xmm1
5607
5608 IEMIMPL_SSE_EPILOGUE ; Fixed: was IEMIMPL_SSE_PROLOGUE (copy/paste typo) - must undo the prologue here, not run it again.
5609 EPILOGUE_3_ARGS
5610ENDPROC iemAImpl_ %+ %1 %+ _u128
5611%endmacro
5612
5613IEMIMPL_P_BLEND pblendvb
5614IEMIMPL_P_BLEND blendvps
5615IEMIMPL_P_BLEND blendvpd
5616
5617
5618;;
5619; One of the v[p]blendv{b,ps,pd} variants
5620;
5621; @param 1 The instruction
5622;
5623; @param A0 Pointer to the first media register sized operand (output).
5624; @param A1 Pointer to the first media register sized operand (input).
5625; @param A2 Pointer to the second media register sized operand (input).
5626; @param A3 Pointer to the media register sized mask value (input).
5627%macro IEMIMPL_AVX_P_BLEND 1
5628BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5629 PROLOGUE_4_ARGS
5630 IEMIMPL_AVX_PROLOGUE
5631
5632 vmovdqu xmm0, [A1]
5633 vmovdqu xmm1, [A2]
5634 vmovdqu xmm2, [A3] ; explicit mask operand of the AVX blend forms
5635 %1 xmm0, xmm0, xmm1, xmm2
5636 vmovdqu [A0], xmm0
5637
5638 IEMIMPL_AVX_EPILOGUE ; Fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo) - must undo the prologue here, not run it again.
5639 EPILOGUE_4_ARGS
5640ENDPROC iemAImpl_ %+ %1 %+ _u128
5641
5642BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5643 PROLOGUE_4_ARGS
5644 IEMIMPL_AVX_PROLOGUE
5645
5646 vmovdqu ymm0, [A1]
5647 vmovdqu ymm1, [A2]
5648 vmovdqu ymm2, [A3] ; explicit mask operand of the AVX blend forms
5649 %1 ymm0, ymm0, ymm1, ymm2
5650 vmovdqu [A0], ymm0
5651
5652 IEMIMPL_AVX_EPILOGUE ; Fixed: was IEMIMPL_AVX_PROLOGUE (copy/paste typo) - must undo the prologue here, not run it again.
5653 EPILOGUE_4_ARGS
5654ENDPROC iemAImpl_ %+ %1 %+ _u256
5655%endmacro
5656
5657IEMIMPL_AVX_P_BLEND vpblendvb
5658IEMIMPL_AVX_P_BLEND vblendvps
5659IEMIMPL_AVX_P_BLEND vblendvpd
5660
5661
5662;;
5663; palignr mm1, mm2/m64 instruction.
5664;
5665; @param A0 Pointer to the first media register sized operand (output).
5666; @param A1 The second register sized operand (input).
5667; @param A2 The 8-bit immediate.
5668BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
5669 PROLOGUE_3_ARGS
5670 IEMIMPL_MMX_PROLOGUE
5671
5672 movzx A2, A2_8 ; must clear top bits
5673 movq mm0, [A0]
5674 movq mm1, A1 ; second operand is passed by value, not pointer
5675 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6 ; dispatch to the .imm<A2> stub below (6-byte stride)
5676 movq [A0], mm0
5677
5678 IEMIMPL_MMX_EPILOGUE
5679 EPILOGUE_3_ARGS
 ; 256 per-imm8 stubs for the MMX palignr form.
5680 %assign bImm 0
5681 %rep 256
5682.imm %+ bImm:
5683 IBT_ENDBRxx_WITHOUT_NOTRACK
5684 palignr mm0, mm1, bImm
5685 ret
5686 %assign bImm bImm + 1
5687 %endrep
5688.immEnd:
5689ENDPROC iemAImpl_palignr_u64
5690
5691
5692;;
5693; SSE instructions with 8-bit immediates of the form
5694; xxx xmm1, xmm2, imm8.
5695; where the instruction encoding takes up 6 bytes.
5696;
5697; @param 1 The instruction name.
5698;
5699; @param A0 Pointer to the first media register size operand (input/output).
5700; @param A1 Pointer to the second source media register size operand (input).
5701; @param A2 The 8-bit immediate
5702;
5703%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
5704BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5705 PROLOGUE_3_ARGS
5706 IEMIMPL_SSE_PROLOGUE
5707
5708 movzx A2, A2_8 ; must clear top bits
5709 movdqu xmm0, [A0]
5710 movdqu xmm1, [A1]
5711 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8 ; dispatch to the .imm<A2> stub below (8-byte stride)
5712 movdqu [A0], xmm0
5713
5714 IEMIMPL_SSE_EPILOGUE
5715 EPILOGUE_3_ARGS
 ; 256 per-imm8 stubs: 6-byte instruction + ret + int3 padding = 8-byte stride.
5716 %assign bImm 0
5717 %rep 256
5718.imm %+ bImm:
5719 IBT_ENDBRxx_WITHOUT_NOTRACK
5720 %1 xmm0, xmm1, bImm
5721 ret
5722 int3 ; padding to the common stub stride; never executed
5723 %assign bImm bImm + 1
5724 %endrep
5725.immEnd:
5726ENDPROC iemAImpl_ %+ %1 %+ _u128
5727%endmacro
5728
5729IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
5730IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
5731IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
5732IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
5733IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
5734IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5735IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5736
5737
5738;;
5739; AVX instructions with 8-bit immediates of the form
5740; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5741; where the instruction encoding takes up 6 bytes.
5742;
5743; @param 1 The instruction name.
5744; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5745; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5746;
5747; @param A0 Pointer to the destination media register size operand (output).
5748; @param A1 Pointer to the first source media register size operand (input).
5749; @param A2 Pointer to the second source media register size operand (input).
5750; @param A3 The 8-bit immediate
5751;
5752%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
5753 %if %2 == 1
5754BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5755 PROLOGUE_4_ARGS
5756 IEMIMPL_AVX_PROLOGUE
5757
5758 movzx A3, A3_8 ; must clear top bits
5759 movdqu xmm0, [A1]
5760 movdqu xmm1, [A2]
5761 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; dispatch to the .imm<A3> stub below (8-byte stride)
5762 movdqu [A0], xmm0
5763
5764 IEMIMPL_AVX_EPILOGUE
5765 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs; xmm0 doubles as first source and destination.
5766 %assign bImm 0
5767 %rep 256
5768.imm %+ bImm:
5769 IBT_ENDBRxx_WITHOUT_NOTRACK
5770 %1 xmm0, xmm0, xmm1, bImm
5771 ret
5772 int3 ; padding to the common stub stride; never executed
5773 %assign bImm bImm + 1
5774 %endrep
5775.immEnd:
5776ENDPROC iemAImpl_ %+ %1 %+ _u128
5777 %endif
5778
5779 %if %3 == 1
5780BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5781 PROLOGUE_4_ARGS
5782 IEMIMPL_AVX_PROLOGUE
5783
5784 movzx A3, A3_8 ; must clear top bits
5785 vmovdqu ymm0, [A1]
5786 vmovdqu ymm1, [A2]
5787 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; dispatch to the .imm<A3> stub below (8-byte stride)
5788 vmovdqu [A0], ymm0
5789
5790 IEMIMPL_AVX_EPILOGUE
5791 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs for the 256-bit form.
5792 %assign bImm 0
5793 %rep 256
5794.imm %+ bImm:
5795 IBT_ENDBRxx_WITHOUT_NOTRACK
5796 %1 ymm0, ymm0, ymm1, bImm
5797 ret
5798 int3 ; padding to the common stub stride; never executed
5799 %assign bImm bImm + 1
5800 %endrep
5801.immEnd:
5802ENDPROC iemAImpl_ %+ %1 %+ _u256
5803 %endif
5804%endmacro
5805
5806IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
5807IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
5808IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
5809IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd, 1, 1
5810IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
5811IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
5812IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
5813IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
5814IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
5815
5816
5817;;
5818; AVX instructions with 8-bit immediates of the form
5819; xxx {x,y}mm1, {x,y}mm2, imm8.
5820; where the instruction encoding takes up 6 bytes.
5821;
5822; @param 1 The instruction name.
5823; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5824; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5825; @param 4 The number of bytes taken up by a single instance of the instruction.
5826;
5827; @param A0 Pointer to the destination media register size operand (output).
5828; @param A1 Pointer to the first source media register size operand (input).
5829; @param A2 The 8-bit immediate
5830;
5831%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP 4
5832 %if %2 == 1
5833BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
5834 PROLOGUE_4_ARGS
5835 IEMIMPL_AVX_PROLOGUE
5836
5837 movzx A2, A2_8 ; must clear top bits
5838 movdqu xmm1, [A1]
5839 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4 ; dispatch to the .imm<A2> stub below (stride = %4, per-instruction)
5840 movdqu [A0], xmm0
5841
5842 IEMIMPL_AVX_EPILOGUE
5843 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs; destination is only written, hence no load of xmm0 above.
5844 %assign bImm 0
5845 %rep 256
5846.imm %+ bImm:
5847 IBT_ENDBRxx_WITHOUT_NOTRACK
5848 %1 xmm0, xmm1, bImm
5849 ret
5850 int3 ; padding to the common stub stride; never executed
5851 %assign bImm bImm + 1
5852 %endrep
5853.immEnd:
5854ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
5855 %endif
5856
5857 %if %3 == 1
5858BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
5859 PROLOGUE_4_ARGS
5860 IEMIMPL_AVX_PROLOGUE
5861
5862 movzx A2, A2_8 ; must clear top bits
5863 vmovdqu ymm1, [A1]
5864 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4 ; dispatch to the .imm<A2> stub below (stride = %4, per-instruction)
5865 vmovdqu [A0], ymm0
5866
5867 IEMIMPL_AVX_EPILOGUE
5868 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs for the 256-bit form.
5869 %assign bImm 0
5870 %rep 256
5871.imm %+ bImm:
5872 IBT_ENDBRxx_WITHOUT_NOTRACK
5873 %1 ymm0, ymm1, bImm
5874 ret
5875 int3 ; padding to the common stub stride; never executed
5876 %assign bImm bImm + 1
5877 %endrep
5878.immEnd:
5879ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
5880 %endif
5881%endmacro
5882
5883IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilps, 1, 1, 8
5884IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilpd, 1, 1, 8
5885IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpslldq, 1, 1, 7
5886IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpsrldq, 1, 1, 7
5887
5888
5889;;
5890; Need to move this as well somewhere better?
5891;
; Source-operand block for the implicit-length pcmpistr{i,m} helpers below.
5892struc IEMPCMPISTRXSRC
5893 .uSrc1 resd 4 ; first 128-bit source operand
5894 .uSrc2 resd 4 ; second 128-bit source operand
5895endstruc
5896
; Source-operand block for the explicit-length pcmpestr{i,m} helpers below;
; u64Rax/u64Rdx are loaded into RAX/RDX before the instruction is executed.
5897struc IEMPCMPESTRXSRC
5898 .uSrc1 resd 4 ; first 128-bit source operand
5899 .uSrc2 resd 4 ; second 128-bit source operand
5900 .u64Rax resd 2 ; guest RAX value (explicit length of src1)
5901 .u64Rdx resd 2 ; guest RDX value (explicit length of src2)
5902endstruc
5903
5904;;
5905; The pcmpistri instruction.
5906;
5907; @param A0 Pointer to the ECX register to store the result to (output).
5908; @param A1 Pointer to the EFLAGS register.
5909; @param A2 Pointer to the structure containing the source operands (input).
5910; @param A3 The 8-bit immediate
5911;
5912BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
5913 PROLOGUE_4_ARGS
5914 IEMIMPL_SSE_PROLOGUE
5915
5916 movzx A3, A3_8 ; must clear top bits
5917 movdqu xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
5918 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
5919 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5920 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; dispatch to the .imm<A3> stub below (8-byte stride)
5921
 ; pcmpistri sets CF/ZF/SF/OF and leaves its index result in ecx.
5922 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
5923 mov [T2], ecx
5924
5925 IEMIMPL_SSE_EPILOGUE
5926 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs, int3-padded to the 8-byte stride.
5927 %assign bImm 0
5928 %rep 256
5929.imm %+ bImm:
5930 IBT_ENDBRxx_WITHOUT_NOTRACK
5931 pcmpistri xmm0, xmm1, bImm
5932 ret
5933 int3 ; padding to the common stub stride; never executed
5934 %assign bImm bImm + 1
5935 %endrep
5936.immEnd:
5937ENDPROC iemAImpl_pcmpistri_u128
5938
5939;;
5940; The pcmpestri instruction.
5941;
5942; @param A0 Pointer to the ECX register to store the result to (output).
5943; @param A1 Pointer to the EFLAGS register.
5944; @param A2 Pointer to the structure containing the source operands (input).
5945; @param A3 The 8-bit immediate
5946;
5947BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
5948 PROLOGUE_4_ARGS
5949 IEMIMPL_SSE_PROLOGUE
5950
5951 movzx A3, A3_8 ; must clear top bits
5952 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
5953 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
5954 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
 ; Only compute the stub address here (no call yet) - RAX/RDX must be loaded
 ; with the guest's explicit lengths first, which would clobber the args.
5955 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
5956 push xDX ; xDX can be A1 or A2 depending on the calling convention
5957 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5958 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5959 IBT_NOTRACK
5960 call T1
5961
5962 pop xDX
5963 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
5964 mov [T2], ecx ; pcmpestri leaves its index result in ecx
5965
5966 IEMIMPL_SSE_EPILOGUE
5967 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs; the REX.W prefix makes each 8 bytes with the ret.
5968 %assign bImm 0
5969 %rep 256
5970.imm %+ bImm:
5971 IBT_ENDBRxx_WITHOUT_NOTRACK
5972 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5973 pcmpestri xmm0, xmm1, bImm
5974 ret
5975 %assign bImm bImm + 1
5976 %endrep
5977.immEnd:
5978ENDPROC iemAImpl_pcmpestri_u128
5979
5980;;
5981; The pcmpistrm instruction template.
5982;
5983; @param A0 Pointer to the XMM0 register to store the result to (output).
5984; @param A1 Pointer to the EFLAGS register.
5985; @param A2 Pointer to the structure containing the source operands (input).
5986; @param A3 The 8-bit immediate
5987;
5988BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
5989 PROLOGUE_4_ARGS
5990 IEMIMPL_SSE_PROLOGUE
5991
5992 movzx A3, A3_8 ; must clear top bits
 ; Sources go in xmm1/xmm2 because pcmpistrm writes its mask result to xmm0.
5993 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
5994 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
5995 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; dispatch to the .imm<A3> stub below (8-byte stride)
5996
5997 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
5998 movdqu [A0], xmm0 ; store the implicit xmm0 mask result
5999
6000 IEMIMPL_SSE_EPILOGUE
6001 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs, int3-padded to the 8-byte stride.
6002 %assign bImm 0
6003 %rep 256
6004.imm %+ bImm:
6005 IBT_ENDBRxx_WITHOUT_NOTRACK
6006 pcmpistrm xmm1, xmm2, bImm
6007 ret
6008 int3 ; padding to the common stub stride; never executed
6009 %assign bImm bImm + 1
6010 %endrep
6011.immEnd:
6012ENDPROC iemAImpl_pcmpistrm_u128
6013
6014;;
6015; The pcmpestrm instruction template.
6016;
6017; @param A0 Pointer to the XMM0 register to store the result to (output).
6018; @param A1 Pointer to the EFLAGS register.
6019; @param A2 Pointer to the structure containing the source operands (input).
6020; @param A3 The 8-bit immediate
6021;
6022BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
6023 PROLOGUE_4_ARGS
6024 IEMIMPL_SSE_PROLOGUE
6025
6026 movzx A3, A3_8 ; must clear top bits
 ; Sources go in xmm1/xmm2 because pcmpestrm writes its mask result to xmm0.
6027 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
6028 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
 ; Only compute the stub address here (no call yet) - RAX/RDX must be loaded
 ; with the guest's explicit lengths first, which would clobber the args.
6029 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6030 push xDX ; xDX can be A1 or A2 depending on the calling convention
6031 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6032 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6033 IBT_NOTRACK
6034 call T1
6035
6036 pop xDX
6037 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6038 movdqu [A0], xmm0 ; store the implicit xmm0 mask result
6039
6040 IEMIMPL_SSE_EPILOGUE
6041 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs; the REX.W prefix makes each 8 bytes with the ret.
6042 %assign bImm 0
6043 %rep 256
6044.imm %+ bImm:
6045 IBT_ENDBRxx_WITHOUT_NOTRACK
6046 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
6047 pcmpestrm xmm1, xmm2, bImm
6048 ret
6049 %assign bImm bImm + 1
6050 %endrep
6051.immEnd:
6052ENDPROC iemAImpl_pcmpestrm_u128
6053
6054
6055;;
6056; movmskp{s,d} SSE instruction template
6057;
6058; @param 1 The SSE instruction name.
6059; @param 2 The AVX instruction name.
6060;
6061; @param A0 Pointer to the output register (output/byte sized).
6062; @param A1 Pointer to the source media register size operand (input).
6063;
6064%macro IEMIMPL_MEDIA_MOVMSK_P 2
6065BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6066 PROLOGUE_2_ARGS
6067 IEMIMPL_SSE_PROLOGUE
6068
6069 movdqu xmm0, [A1]
6070 %1 T0, xmm0 ; sign-bit mask into a GPR
6071 mov byte [A0], T0_8
6072
6073 IEMIMPL_SSE_EPILOGUE
6074 EPILOGUE_2_ARGS
6075ENDPROC iemAImpl_ %+ %1 %+ _u128
6076
6077BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
6078 PROLOGUE_2_ARGS
6079 IEMIMPL_AVX_PROLOGUE
6080
6081 movdqu xmm0, [A1]
6082 %2 T0, xmm0 ; AVX form, 128-bit source
6083 mov byte [A0], T0_8
6084
6085 IEMIMPL_AVX_EPILOGUE
6086 EPILOGUE_2_ARGS
6087ENDPROC iemAImpl_ %+ %2 %+ _u128
6088
6089BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
6090 PROLOGUE_2_ARGS
6091 IEMIMPL_AVX_PROLOGUE
6092
6093 vmovdqu ymm0, [A1]
6094 %2 T0, ymm0 ; AVX form, 256-bit source
6095 mov byte [A0], T0_8
6096
6097 IEMIMPL_AVX_EPILOGUE
6098 EPILOGUE_2_ARGS
6099ENDPROC iemAImpl_ %+ %2 %+ _u256
6100%endmacro
6101
6102IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
6103IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6104
6105
6106;;
6107; cvttsd2si instruction - 32-bit variant.
6108;
6109; @return R0_32 The new MXCSR value of the guest.
6110; @param A0_32 The guest's MXCSR register value to use.
6111; @param A1 Pointer to the result operand (output).
6112; @param A2 Pointer to the second operand (input).
6113;
6114BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
6115 PROLOGUE_3_ARGS ; three arguments (A0..A2) - was PROLOGUE_4_ARGS/EPILOGUE_4_ARGS, now matches the i64 variant below
6116 IEMIMPL_SSE_PROLOGUE
6117 SSE_AVX_LD_MXCSR A0_32
6118
6119 cvttsd2si T0_32, [A2] ; f64 -> i32, truncating
6120 mov dword [A1], T0_32
6121
6122 SSE_AVX_ST_MXCSR R0_32, A0_32
6123 IEMIMPL_SSE_EPILOGUE
6124 EPILOGUE_3_ARGS
6125ENDPROC iemAImpl_cvttsd2si_i32_r64
6126
6127;;
6128; cvttsd2si instruction - 64-bit variant.
6129;
6130; @return R0_32 The new MXCSR value of the guest.
6131; @param A0_32 The guest's MXCSR register value to use.
6132; @param A1 Pointer to the result operand (output).
6133; @param A2 Pointer to the second operand (input).
6134;
6135BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
6136 PROLOGUE_3_ARGS
6137 IEMIMPL_SSE_PROLOGUE
6138 SSE_AVX_LD_MXCSR A0_32 ; run under the guest's MXCSR so exceptions/rounding match
6139
6140 cvttsd2si T0, [A2] ; f64 -> i64, truncating
6141 mov qword [A1], T0
6142
6143 SSE_AVX_ST_MXCSR R0_32, A0_32 ; return updated MXCSR, restore the host's
6144 IEMIMPL_SSE_EPILOGUE
6145 EPILOGUE_3_ARGS
6146ENDPROC iemAImpl_cvttsd2si_i64_r64
6147
6148
6149;;
6150; cvtsd2si instruction - 32-bit variant.
6151;
6152; @return R0_32 The new MXCSR value of the guest.
6153; @param A0_32 The guest's MXCSR register value to use.
6154; @param A1 Pointer to the result operand (output).
6155; @param A2 Pointer to the second operand (input).
6156;
6157BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
6158 PROLOGUE_3_ARGS
6159 IEMIMPL_SSE_PROLOGUE
6160 SSE_AVX_LD_MXCSR A0_32
6161
6162 cvtsd2si T0_32, [A2] ; f64 -> i32, rounding per MXCSR.RC
6163 mov dword [A1], T0_32
6164
6165 SSE_AVX_ST_MXCSR R0_32, A0_32
6166 IEMIMPL_SSE_EPILOGUE
6167 EPILOGUE_3_ARGS
6168ENDPROC iemAImpl_cvtsd2si_i32_r64
6169
6170;;
6171; cvtsd2si instruction - 64-bit variant.
6172;
6173; @return R0_32 The new MXCSR value of the guest.
6174; @param A0_32 The guest's MXCSR register value to use.
6175; @param A1 Pointer to the result operand (output).
6176; @param A2 Pointer to the second operand (input).
6177;
6178BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
6179 PROLOGUE_3_ARGS
6180 IEMIMPL_SSE_PROLOGUE
6181 SSE_AVX_LD_MXCSR A0_32
6182
6183 cvtsd2si T0, [A2] ; f64 -> i64, rounding per MXCSR.RC
6184 mov qword [A1], T0
6185
6186 SSE_AVX_ST_MXCSR R0_32, A0_32
6187 IEMIMPL_SSE_EPILOGUE
6188 EPILOGUE_3_ARGS
6189ENDPROC iemAImpl_cvtsd2si_i64_r64
6190
6191
6192;;
6193; cvttss2si instruction - 32-bit variant.
6194;
6195; @return R0_32 The new MXCSR value of the guest.
6196; @param A0_32 The guest's MXCSR register value to use.
6197; @param A1 Pointer to the result operand (output).
6198; @param A2 Pointer to the second operand (input).
6199;
6200BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
6201 PROLOGUE_3_ARGS
6202 IEMIMPL_SSE_PROLOGUE
6203 SSE_AVX_LD_MXCSR A0_32
6204
6205 cvttss2si T0_32, [A2] ; f32 -> i32, truncating
6206 mov dword [A1], T0_32
6207
6208 SSE_AVX_ST_MXCSR R0_32, A0_32
6209 IEMIMPL_SSE_EPILOGUE
6210 EPILOGUE_3_ARGS
6211ENDPROC iemAImpl_cvttss2si_i32_r32
6212
6213;;
6214; cvttss2si instruction - 64-bit variant.
6215;
6216; @return R0_32 The new MXCSR value of the guest.
6217; @param A0_32 The guest's MXCSR register value to use.
6218; @param A1 Pointer to the result operand (output).
6219; @param A2 Pointer to the second operand (input).
6220;
6221BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
6222 PROLOGUE_3_ARGS
6223 IEMIMPL_SSE_PROLOGUE
6224 SSE_AVX_LD_MXCSR A0_32
6225
6226 cvttss2si T0, [A2] ; f32 -> i64, truncating
6227 mov qword [A1], T0
6228
6229 SSE_AVX_ST_MXCSR R0_32, A0_32
6230 IEMIMPL_SSE_EPILOGUE
6231 EPILOGUE_3_ARGS
6232ENDPROC iemAImpl_cvttss2si_i64_r32
6233
6234
6235;;
6236; cvtss2si instruction - 32-bit variant.
6237;
6238; @return R0_32 The new MXCSR value of the guest.
6239; @param A0_32 The guest's MXCSR register value to use.
6240; @param A1 Pointer to the result operand (output).
6241; @param A2 Pointer to the second operand (input).
6242;
6243BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
6244 PROLOGUE_3_ARGS
6245 IEMIMPL_SSE_PROLOGUE
6246 SSE_AVX_LD_MXCSR A0_32
6247
6248 cvtss2si T0_32, [A2] ; f32 -> i32, rounding per MXCSR.RC
6249 mov dword [A1], T0_32
6250
6251 SSE_AVX_ST_MXCSR R0_32, A0_32
6252 IEMIMPL_SSE_EPILOGUE
6253 EPILOGUE_3_ARGS
6254ENDPROC iemAImpl_cvtss2si_i32_r32
6255
6256;;
6257; cvtss2si instruction - 64-bit variant.
6258;
6259; @return R0_32 The new MXCSR value of the guest.
6260; @param A0_32 The guest's MXCSR register value to use.
6261; @param A1 Pointer to the result operand (output).
6262; @param A2 Pointer to the second operand (input).
6263;
6264BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
6265 PROLOGUE_3_ARGS
6266 IEMIMPL_SSE_PROLOGUE
6267 SSE_AVX_LD_MXCSR A0_32
6268
6269 cvtss2si T0, [A2] ; f32 -> i64, rounding per MXCSR.RC
6270 mov qword [A1], T0
6271
6272 SSE_AVX_ST_MXCSR R0_32, A0_32
6273 IEMIMPL_SSE_EPILOGUE
6274 EPILOGUE_3_ARGS
6275ENDPROC iemAImpl_cvtss2si_i64_r32
6276
6277
6278;;
6279; cvtsi2ss instruction - 32-bit variant.
6280;
6281; @return R0_32 The new MXCSR value of the guest.
6282; @param A0_32 The guest's MXCSR register value to use.
6283; @param A1 Pointer to the result operand (output).
6284; @param A2 Pointer to the second operand (input).
6285;
6286BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
6287 PROLOGUE_3_ARGS
6288 IEMIMPL_SSE_PROLOGUE
6289 SSE_AVX_LD_MXCSR A0_32
6290
6291 cvtsi2ss xmm0, dword [A2] ; i32 -> f32
6292 movd dword [A1], xmm0 ; store only the low 32-bit scalar result
6293
6294 SSE_AVX_ST_MXCSR R0_32, A0_32
6295 IEMIMPL_SSE_EPILOGUE
6296 EPILOGUE_3_ARGS
6297ENDPROC iemAImpl_cvtsi2ss_r32_i32
6298
6299;;
6300; cvtsi2ss instruction - 64-bit variant.
6301;
6302; @return R0_32 The new MXCSR value of the guest.
6303; @param A0_32 The guest's MXCSR register value to use.
6304; @param A1 Pointer to the result operand (output).
6305; @param A2 Pointer to the second operand (input).
6306;
6307BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
6308 PROLOGUE_3_ARGS
6309 IEMIMPL_SSE_PROLOGUE
6310 SSE_AVX_LD_MXCSR A0_32
6311
6312 cvtsi2ss xmm0, qword [A2] ; i64 -> f32
6313 movd dword [A1], xmm0 ; store only the low 32-bit scalar result
6314
6315 SSE_AVX_ST_MXCSR R0_32, A0_32
6316 IEMIMPL_SSE_EPILOGUE
6317 EPILOGUE_3_ARGS
6318ENDPROC iemAImpl_cvtsi2ss_r32_i64
6319
6320
6321;;
6322; cvtsi2sd instruction - 32-bit variant.
6323;
6324; @return R0_32 The new MXCSR value of the guest.
6325; @param A0_32 The guest's MXCSR register value to use.
6326; @param A1 Pointer to the result operand (output).
6327; @param A2 Pointer to the second operand (input).
6328;
6329BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
6330 PROLOGUE_3_ARGS
6331 IEMIMPL_SSE_PROLOGUE
6332 SSE_AVX_LD_MXCSR A0_32
6333
6334 cvtsi2sd xmm0, dword [A2] ; i32 -> f64
6335 movq [A1], xmm0 ; store only the low 64-bit scalar result
6336
6337 SSE_AVX_ST_MXCSR R0_32, A0_32
6338 IEMIMPL_SSE_EPILOGUE
6339 EPILOGUE_3_ARGS
6340ENDPROC iemAImpl_cvtsi2sd_r64_i32
6341
6342;;
6343; cvtsi2sd instruction - 64-bit variant.
6344;
6345; @return R0_32 The new MXCSR value of the guest.
6346; @param A0_32 The guest's MXCSR register value to use.
6347; @param A1 Pointer to the result operand (output).
6348; @param A2 Pointer to the second operand (input).
6349;
6350BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
6351 PROLOGUE_3_ARGS
6352 IEMIMPL_SSE_PROLOGUE
6353 SSE_AVX_LD_MXCSR A0_32
6354
6355 cvtsi2sd xmm0, qword [A2] ; i64 -> f64
6356 movq [A1], xmm0 ; store only the low 64-bit scalar result
6357
6358 SSE_AVX_ST_MXCSR R0_32, A0_32
6359 IEMIMPL_SSE_EPILOGUE
6360 EPILOGUE_3_ARGS
6361ENDPROC iemAImpl_cvtsi2sd_r64_i64
6362
6363
6364;
6365; UCOMISS (SSE)
6366;
6367; @return R0_32 The new MXCSR value of the guest.
6368; @param A0_32 The guest's MXCSR register value to use (input).
6369; @param A1 Pointer to the EFLAGS value (input/output).
6370; @param A2_32 The first source operand.
6371; @param A3_32 The second source operand.
6372;
6373BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
6374 PROLOGUE_4_ARGS
6375 IEMIMPL_SSE_PROLOGUE
6376 SSE_AVX_LD_MXCSR A0_32
6377
6378 movd xmm0, A2_32 ; scalar f32 operands passed by value
6379 movd xmm1, A3_32
6380 ucomiss xmm0, xmm1 ; unordered compare; sets ZF/PF/CF
6381 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6382
6383 SSE_AVX_ST_MXCSR R0_32, A0_32
6384 IEMIMPL_SSE_EPILOGUE
6385 EPILOGUE_4_ARGS
6386ENDPROC iemAImpl_ucomiss_u128
6387
6388BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
6389 PROLOGUE_4_ARGS
6390 IEMIMPL_SSE_PROLOGUE
6391 SSE_AVX_LD_MXCSR A0_32
6392
6393 movd xmm0, A2_32 ; scalar f32 operands passed by value
6394 movd xmm1, A3_32
6395 vucomiss xmm0, xmm1 ; unordered compare; sets ZF/PF/CF
6396 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6397
6398 SSE_AVX_ST_MXCSR R0_32, A0_32
6399 IEMIMPL_SSE_EPILOGUE
6400 EPILOGUE_4_ARGS ; Fixed: was EPILOGUE_3_ARGS - must pair with PROLOGUE_4_ARGS above (cf. vucomisd below).
6401ENDPROC iemAImpl_vucomiss_u128
6402
6403
6404;
6405; UCOMISD (SSE)
6406;
6407; @return R0_32 The new MXCSR value of the guest.
6408; @param A0_32 The guest's MXCSR register value to use (input).
6409; @param A1 Pointer to the EFLAGS value (input/output).
6410; @param A2 The first source operand.
6411; @param A3 The second source operand.
6412;
6413BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
6414 PROLOGUE_4_ARGS
6415 IEMIMPL_SSE_PROLOGUE
6416 SSE_AVX_LD_MXCSR A0_32
6417
6418 movq xmm0, A2 ; scalar f64 operands passed by value
6419 movq xmm1, A3
6420 ucomisd xmm0, xmm1 ; unordered compare; sets ZF/PF/CF
6421 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6422
6423 SSE_AVX_ST_MXCSR R0_32, A0_32
6424 IEMIMPL_SSE_EPILOGUE
6425 EPILOGUE_4_ARGS
6426ENDPROC iemAImpl_ucomisd_u128
6427
6428BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
6429 PROLOGUE_4_ARGS
6430 IEMIMPL_SSE_PROLOGUE
6431 SSE_AVX_LD_MXCSR A0_32
6432
6433 movq xmm0, A2 ; scalar f64 operands passed by value
6434 movq xmm1, A3
6435 vucomisd xmm0, xmm1 ; unordered compare; sets ZF/PF/CF
6436 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6437
6438 SSE_AVX_ST_MXCSR R0_32, A0_32
6439 IEMIMPL_SSE_EPILOGUE
6440 EPILOGUE_4_ARGS
6441ENDPROC iemAImpl_vucomisd_u128
6442
6443;
6444; COMISS (SSE)
6445;
6446; @return R0_32 The new MXCSR value of the guest.
6447; @param A0_32 The guest's MXCSR register value to use (input).
6448; @param A1 Pointer to the EFLAGS value (input/output).
6449; @param A2_32 The first source operand.
6450; @param A3_32 The second source operand.
6451;
6452BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
6453 PROLOGUE_4_ARGS
6454 IEMIMPL_SSE_PROLOGUE
6455 SSE_AVX_LD_MXCSR A0_32
6456
6457 movd xmm0, A2_32 ; scalar f32 operands passed by value
6458 movd xmm1, A3_32
6459 comiss xmm0, xmm1 ; ordered compare; sets ZF/PF/CF
6460 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6461
6462 SSE_AVX_ST_MXCSR R0_32, A0_32
6463 IEMIMPL_SSE_EPILOGUE
6464 EPILOGUE_4_ARGS
6465ENDPROC iemAImpl_comiss_u128
6466
6467BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
6468 PROLOGUE_4_ARGS
6469 IEMIMPL_SSE_PROLOGUE
6470 SSE_AVX_LD_MXCSR A0_32
6471
6472 movd xmm0, A2_32 ; scalar f32 operands passed by value
6473 movd xmm1, A3_32
6474 vcomiss xmm0, xmm1 ; ordered compare; sets ZF/PF/CF
6475 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6476
6477 SSE_AVX_ST_MXCSR R0_32, A0_32
6478 IEMIMPL_SSE_EPILOGUE
6479 EPILOGUE_4_ARGS
6480ENDPROC iemAImpl_vcomiss_u128
6481
6482
6483;
6484; COMISD (SSE)
6485;
6486; @return R0_32 The new MXCSR value of the guest.
6487; @param A0_32 The guest's MXCSR register value to use (input).
6488; @param A1 Pointer to the EFLAGS value (input/output).
6489; @param A2 The first source operand.
6490; @param A3 The second source operand.
6491;
6492BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
6493 PROLOGUE_4_ARGS
6494 IEMIMPL_SSE_PROLOGUE
6495 SSE_AVX_LD_MXCSR A0_32
6496
6497 movq xmm0, A2 ; scalar f64 operands passed by value
6498 movq xmm1, A3
6499 comisd xmm0, xmm1 ; ordered compare; sets ZF/PF/CF
6500 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6501
6502 SSE_AVX_ST_MXCSR R0_32, A0_32
6503 IEMIMPL_SSE_EPILOGUE
6504 EPILOGUE_4_ARGS
6505ENDPROC iemAImpl_comisd_u128
6506
6507BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
6508 PROLOGUE_4_ARGS
6509 IEMIMPL_SSE_PROLOGUE
6510 SSE_AVX_LD_MXCSR A0_32
6511
6512 movq xmm0, A2 ; scalar f64 operands passed by value
6513 movq xmm1, A3
6514 vcomisd xmm0, xmm1 ; ordered compare; sets ZF/PF/CF
6515 IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF
6516
6517 SSE_AVX_ST_MXCSR R0_32, A0_32
6518 IEMIMPL_SSE_EPILOGUE
6519 EPILOGUE_4_ARGS
6520ENDPROC iemAImpl_vcomisd_u128
6521
6522
6523;;
6524; Need to move this as well somewhere better?
6525;
; Pair of 128-bit source operands for the MXCSR-aware compare/round helpers below.
6526struc IEMMEDIAF2XMMSRC
6527 .uSrc1 resd 4 ; first 128-bit source operand
6528 .uSrc2 resd 4 ; second 128-bit source operand
6529endstruc
6530
6531
6532;
6533; CMPPS (SSE)
6534;
6535; @return R0_32 The new MXCSR value of the guest.
6536; @param A0_32 The guest's MXCSR register value to use (input).
6537; @param A1 Pointer to the first media register size operand (output).
6538; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6539; @param A3 The 8-bit immediate (input).
6540;
6541BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
6542 PROLOGUE_4_ARGS
6543 IEMIMPL_SSE_PROLOGUE
6544 SSE_AVX_LD_MXCSR A0_32 ; run under the guest's MXCSR so exceptions/rounding match
6545
6546 movzx A3, A3_8 ; must clear top bits
6547 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6548 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6549 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 5 ; dispatch to the .imm<A3> stub below (5-byte stride)
6550 movdqu [A1], xmm0
6551
6552 SSE_AVX_ST_MXCSR R0_32, A0_32 ; return updated MXCSR, restore the host's
6553 IEMIMPL_SSE_EPILOGUE
6554 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs; cmpps + ret exactly fills the 5-byte stride, no padding.
6555 %assign bImm 0
6556 %rep 256
6557.imm %+ bImm:
6558 IBT_ENDBRxx_WITHOUT_NOTRACK
6559 cmpps xmm0, xmm1, bImm
6560 ret
6561 %assign bImm bImm + 1
6562 %endrep
6563.immEnd:
6564ENDPROC iemAImpl_cmpps_u128
6565
6566;;
6567; SSE instructions with 8-bit immediates of the form
6568; xxx xmm1, xmm2, imm8.
6569; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
6570; register.
6571;
6572; @param 1 The instruction name.
6573;
6574; @return R0_32 The new MXCSR value of the guest.
6575; @param A0_32 The guest's MXCSR register value to use (input).
6576; @param A1 Pointer to the first media register size operand (output).
6577; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6578; @param A3 The 8-bit immediate (input).
6579;
6580%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
6581BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6582 PROLOGUE_4_ARGS
6583 IEMIMPL_SSE_PROLOGUE
6584 SSE_AVX_LD_MXCSR A0_32 ; run under the guest's MXCSR so exceptions/rounding match
6585
6586 movzx A3, A3_8 ; must clear top bits
6587 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6588 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6589 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6 ; dispatch to the .imm<A3> stub below (6-byte stride: 5-byte insn + ret)
6590 movdqu [A1], xmm0
6591
6592 SSE_AVX_ST_MXCSR R0_32, A0_32 ; return updated MXCSR, restore the host's
6593 IEMIMPL_SSE_EPILOGUE
6594 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs; prefixed encodings are one byte longer than cmpps, so
 ; insn + ret exactly fills the 6-byte stride without padding.
6595 %assign bImm 0
6596 %rep 256
6597.imm %+ bImm:
6598 IBT_ENDBRxx_WITHOUT_NOTRACK
6599 %1 xmm0, xmm1, bImm
6600 ret
6601 %assign bImm bImm + 1
6602 %endrep
6603.immEnd:
6604ENDPROC iemAImpl_ %+ %1 %+ _u128
6605%endmacro
6606
6607IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
6608IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
6609IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6610
6611;;
6612; SSE instructions with 8-bit immediates of the form
6613; xxx xmm1, xmm2, imm8.
6614; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6615; register.
6616;
6617; @param 1 The instruction name.
6618;
6619; @return R0_32 The new MXCSR value of the guest.
6620; @param A0_32 The guest's MXCSR register value to use (input).
6621; @param A1 Pointer to the first media register size operand (output).
6622; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6623; @param A3 The 8-bit immediate (input).
6624;
6625%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
6626BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6627 PROLOGUE_4_ARGS
6628 IEMIMPL_SSE_PROLOGUE
6629 SSE_AVX_LD_MXCSR A0_32 ; run under the guest's MXCSR so exceptions/rounding match
6630
6631 movzx A3, A3_8 ; must clear top bits
6632 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6633 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6634 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; dispatch to the .imm<A3> stub below (8-byte stride)
6635 movdqu [A1], xmm0
6636
6637 SSE_AVX_ST_MXCSR R0_32, A0_32 ; return updated MXCSR, restore the host's
6638 IEMIMPL_SSE_EPILOGUE
6639 EPILOGUE_4_ARGS
 ; 256 per-imm8 stubs: 6-byte instruction + ret + int3 padding = 8-byte stride.
6640 %assign bImm 0
6641 %rep 256
6642.imm %+ bImm:
6643 IBT_ENDBRxx_WITHOUT_NOTRACK
6644 %1 xmm0, xmm1, bImm
6645 ret
6646 int3 ; padding to the common stub stride; never executed
6647 %assign bImm bImm + 1
6648 %endrep
6649.immEnd:
6650ENDPROC iemAImpl_ %+ %1 %+ _u128
6651%endmacro
6652
6653IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
6654IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
6655IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
6656IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6657IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6658IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6659
6660
;;
; SSE instructions of the form
;     xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; @param 1      The instruction name (e.g. cvtpd2pi, cvttpd2pi).
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32  The guest's MXCSR register value to use (input).
; @param A1     Pointer to the first MMX register sized operand (output).
; @param A2     Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; install the guest MXCSR (rounding mode affects the conversion)

        movdqu  xmm0, [A2]
        %1      mm0, xmm0                       ; xmm source -> mmx destination
        movq    [A1], mm0

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; fetch resulting guest MXCSR into R0_32, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6691
;;
; SSE instructions of the form
;     xxx xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; @param 1      The instruction name (e.g. cvtpi2ps, cvtpi2pd).
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32  The guest's MXCSR register value to use (input).
; @param A1     Pointer to the first media register sized operand (input/output).
; @param A2     The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; install the guest MXCSR (rounding mode affects the conversion)

        movdqu  xmm0, [A1]                      ; destination is input/output: only part of it is overwritten
        movq    mm0, A2                         ; source value arrives by value, staged via mm0
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; fetch resulting guest MXCSR into R0_32, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6723
;;
; SSE instructions of the form
;     xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; @param 1      The instruction name (e.g. cvtps2pi, cvttps2pi).
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32  The guest's MXCSR register value to use (input).
; @param A1     Pointer to the first MMX media register sized operand (output).
; @param A2     The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; install the guest MXCSR (rounding mode affects the conversion)

        movq    xmm0, A2                        ; source value arrives by value, staged via xmm0
        %1      mm0, xmm0
        movq    [A1], mm0

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; fetch resulting guest MXCSR into R0_32, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6754
;
; All forms of RDRAND and RDSEED
;
; @param 1    The instruction name: rdrand or rdseed.
; @param 2    The host scratch register matching the operand width in %3 (ax/eax/rax).
; @param 3    The operand width in bits: 16, 32 or 64.
;
; @param A0   Pointer to the destination operand.
; @param A1   Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2                              ; execute rdrand/rdseed into the scratch register
        mov     [A0], %2                        ; store the result for the caller
        ; Only CF is taken from the instruction (success indicator); the other
        ; five arithmetic flags are forced to zero.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6779
6780
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; @param A0   Pointer to the first media register size operand (input/output).
; @param A1   Pointer to the second source media register size operand (input).
; @param A2   The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                        ; must clear top bits (imm8 indexes the jump table below)
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6 ; stride 6 = 5-byte instruction encoding + 1-byte ret per entry
        movdqu  [A0], xmm0                      ; write the result back to the destination operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
        ; 256-entry jump table, one entry per imm8 value.  Each entry's size
        ; must match the stride passed to IEMIMPL_CALL_JUMP_TABLE_TARGET above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4 xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_sha1rnds4_u128
6812
6813
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; @param A0   Pointer to the first media register size operand (input/output).
; @param A1   Pointer to the second source media register size operand (input).
; @param A2   Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stage the two explicit operands, then the implicit one which must
        ; reside in XMM0 for the instruction to pick it up.
        movdqu  xmm1, [A0]                      ; destination / state words
        movdqu  xmm2, [A1]                      ; second source operand
        movdqu  xmm0, [A2]                      ; implicit <XMM0> operand
        sha256rnds2 xmm1, xmm2                  ; xmm0 is consumed implicitly
        movdqu  [A0], xmm1                      ; store the updated state

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
6836
6837
;
; 32-bit forms of ADCX and ADOX
;
; @param 1    The instruction name (adcx or adox).
; @param 2    The EFLAGS flag consumed and produced by the instruction
;             (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
; @param A0   Pointer to the destination operand (input/output).
; @param A1   32-bit source operand 1 (input).
; @param A2   Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS                         ; NOTE(review): 4-arg prologue for a 3-arg helper - presumably for extra scratch regs; confirm

        IEM_LOAD_FLAGS_OLD A2, %2, 0            ; load only the flag the instruction consumes
        %1      A1_32, [A0]                     ; A1_32 += [A0] + flag
        mov     [A0], A1_32
        IEM_SAVE_FLAGS_OLD A2, %2, 0, 0         ; only %2 is updated; no other flags are touched

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro
6857
;
; 64-bit forms of ADCX and ADOX
;
; @param 1    The instruction name (adcx or adox).
; @param 2    The EFLAGS flag consumed and produced by the instruction
;             (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
; @param A0   Pointer to the destination operand (input/output).
; @param A1   64-bit source operand 1 (input).
; @param A2   Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS                         ; NOTE(review): 4-arg prologue for a 3-arg helper - presumably for extra scratch regs; confirm

        IEM_LOAD_FLAGS_OLD A2, %2, 0            ; load only the flag the instruction consumes
        %1      A1, [A0]                        ; A1 += [A0] + flag
        mov     [A0], A1
        IEM_SAVE_FLAGS_OLD A2, %2, 0, 0         ; only %2 is updated; no other flags are touched

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
6883
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette