VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 98797

Last change on this file since 98797 was 98781, checked in by vboxsync, 2 years ago

VMM/IEM: Fix the pcmp{e,i}str{i,m} instructions, completely got the explicit variants wrong, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 182.6 KB
Line 
1; $Id: IEMAllAImpl.asm 98781 2023-02-28 13:19:16Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; RET XX / RET wrapper for fastcall.
;
; On 32-bit Windows the fastcall callee pops its own stack arguments, hence
; 'ret %1'; on every other target a plain 'ret' is emitted.
;
; @param 1     Number of argument bytes to pop on x86 Windows.
;
%macro RET_FASTCALL 1
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
        ret %1                          ; fastcall: callee cleans up the stack arguments.
 %else
        ret                             ; other 32-bit conventions: no bytes popped here.
 %endif
%else
        ret                             ; AMD64: arguments are in registers.
%endif
%endmacro
55
;;
; NAME for fastcall functions.
;
; Default mangling is the plain NAME() form; on x86 Windows the fastcall
; decoration '<prefix><name>@<cbArgs>' is used instead.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef  NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
70
;;
; BEGINPROC for fastcall functions.
;
; Exports the decorated name where the object format requires it (PE, OMF)
; and emits the global label in the decorated form.
;
; @param 1     The function name (C).
; @param 2     The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
        export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
        export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
91
92
;
; We employ some macro assembly here to hide the calling convention differences.
;
%ifdef RT_ARCH_AMD64
 ;
 ; AMD64: up to four arguments arrive in registers, so the prologues are
 ; empty and the epilogues are plain returns.
 ;
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64 calling convention: args in rdi, rsi, rdx, rcx.
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64 calling convention: args in rcx, rdx, r8, r9.
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif

 ; Scratch (temporary) registers, volatile in both 64-bit conventions.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86: A0/A1 arrive in ecx/edx (fastcall); further arguments are fetched
 ; from the stack into ebx/esi, which are callee-saved and must be preserved,
 ; as must edi (T1).
 %macro PROLOGUE_1_ARGS 0
        push    edi                     ; preserve T1 for the caller.
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1                      ; pop %1 bytes of stack arguments.
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi                     ; preserve T1 for the caller.
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx                     ; preserve A2's home register.
        mov     ebx, [esp + 4 + 4]      ; load A2 (skip pushed ebx + return address).
        push    edi                     ; preserve T1.
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; load A2 (skip 3 pushes + return address).
        mov     esi, [esp + 12 + 4 + 4] ; load A3.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
281
282
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param A2    The register pointing to the flags.
; @param 1     The parameter (A0..A3) pointing to the eflags.
; @param 2     The set of modified flags.
; @param 3     The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        ; Note: the guard above is commented out, so the load is currently
        ;       performed unconditionally.
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 ;%endif
%endmacro
302
;;
; Update the flag.
;
; Merges the host EFLAGS selected by %2|%3 into the guest flags at [%1].
;
; @remarks Clobbers T0, T1, stack.
; @param 1     The register pointing to the EFLAGS.
; @param 2     The mask of modified flags to save.
; @param 3     The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                      ; T1 = host flags after the emulated instruction.
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
322
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; @remarks Clobbers T0, T1, stack.
; @param 1     The register pointing to the EFLAGS.
; @param 2     The mask of modified flags to save.
; @param 3     Mask of additional flags to always clear
; @param 4     Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host flags.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
346
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
; @param 1     The register pointing to the EFLAGS.
; @param 2     The mask of modified flags to save.
; @param 3     Mask of additional flags to always clear
; @param 4     The result register to set SF by.
; @param 5     The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6     The (full) register containing the parity table index. Will be modified!

%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                      ; T2 = host flags (T0/xAX holds the result, keep it).
 %else
        push    T0                      ; x86 has no T2; borrow T0 and restore it below.
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; restore the saved result register.
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; reduce the table index to the low byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
400
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Does not consult the host EFLAGS, only the stored guest flags at [%1].
;
; @remarks Clobbers T0.
; @param 1     The register pointing to the EFLAGS.
; @param 2     Mask of additional flags to always clear
; @param 3     Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
421
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Also recomputes PF from the parity lookup table indexed by %4.
;
; @remarks Clobbers T0, %4, EFLAGS.  On AMD64 it also clobbers T2 (used below
;          for the RIP-relative parity table address).
; @param 1     The register pointing to the EFLAGS.
; @param 2     Mask of additional flags to always clear
; @param 3     Mask of additional flags to always set.
; @param 4     The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; Reduce the index to the low byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
446
447
448;*********************************************************************************************************************************
449;* External Symbols *
450;*********************************************************************************************************************************
451extern NAME(g_afParity)
452
453
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1     The instruction mnemonic.
; @param 2     Non-zero if there should be a locked version.
; @param 3     The modified flags.
; @param 4     The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr,lock, modified-flags,                                                                undefined flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
553
554
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1     The instruction mnemonic.
; @param 2     The modified flags.
; @param 3     The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                  instr, modified-flags,                                    undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF),              (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
595
;;
; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; NOTE(review): only three parameters (A0..A2) are used and cbArgs is 12, yet
; PROLOGUE_4_ARGS/EPILOGUE_4_ARGS are employed - on x86 EPILOGUE_4_ARGS pops
; 8 stack bytes.  Confirm this is intentional (e.g. only ever built/called on
; AMD64 where it is harmless).
;
; @param 1     The instruction mnemonic.
; @param 2     The modified flags.
; @param 3     The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                    instr,  modified-flags,                                      undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
637
638
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1     The instruction mnemonic.
; @param 2     Fallback instruction if applicable.
; @param 3     Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; shift count into cl for the plain-shift fallback.
        %2      A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; A0 is xCX which we need for the count; now A2 = dst ptr.
        %2      A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; Fix: store the full 64-bit result (was A1_32).
 %else
        xchg    A2, A0                  ; after this A2 = dst ptr, A0 = count.
        %2      A1, cl
        mov     [A2], A1                ; Fix: store the full 64-bit result (was A1_32).
 %endif
        ; Fix: removed the stray 'mov [A0], A1' that followed the %endif; in the
        ; MSC branch A0 no longer points to the destination after the xchg, so
        ; that store wrote through the shift-count value.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                        instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx,  sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx,  shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx,  shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep,  nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext,  nop,            0
710
711
;
; RORX uses a immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; rotate count into cl.
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; A0 is xCX, needed for the count; now A2 = dst ptr.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1                ; Fix: store all 64 bits (was A1_32).
 %else
        xchg    A2, A0                  ; after this A2 = dst ptr, A0 = count.
        ror     A1, cl
        mov     [A2], A1                ; Fix: store all 64 bits (was A1_32).
 %endif
        ; Fix: removed the trailing 'mov [A0], A1'; in the MSC branch A0 had been
        ; swapped with A2 and no longer addressed the destination operand.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
746
747
;
; MULX
;
; A0 = pointer to the high result, A1 = pointer to the low result,
; A2 = uSrc1 (must end up in xDX for mulx), A3 = uSrc2.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32
767
768
; Fallback for hosts without BMI2: emulate MULX via plain MUL (EDX:EAX = EAX * r/m32).
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
787
%ifdef RT_ARCH_AMD64
; 64-bit MULX: A0 = pointer to the high result, A1 = pointer to the low result,
; A2 = uSrc1 (must end up in RDX), A3 = uSrc2.
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


; Fallback for hosts without BMI2: emulate MULX via plain MUL (RDX:RAX = RAX * r/m64).
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
827
828
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1     The instruction mnemonic.
; @param 2     Non-zero if there should be a locked version.
; @param 3     The modified flags.
; @param 4     The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr,lock, modified-flags, undefined flags
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
905
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; @param 1     The instruction mnemonic.
; @param 2     The modified flags.
; @param 3     The undefined flags.
; @param 4     Non-zero if destination isn't written when ZF=1. Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst          ; source was zero: leave the destination untouched.
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; Intel EFLAGS variation: clears OF/SF/AF/CF and recomputes PF from the result.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY    A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS                A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

; AMD EFLAGS variation: only ZF reflects the instruction result.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS       A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY    A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS                A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS       A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

; NOTE(review): this variant loads flags first, unlike the u16/u32 _intel
; variants above - confirm the asymmetry is intentional.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY    A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS                A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS       A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1057
1058
;;
; Macro for implementing POPCNT.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param 1     The instruction mnemonic.
; @param 2     The modified flags.
; @param 3     The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
        mov     [A0], T0_16
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1108
1109
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
BEGINCODE
; Two-operand IMUL: *A0 = *A0 * A1.  A0 points to the destination (register)
; operand, A1 holds the source operand value and A2 points to the eflags
; variable.  The reversed operand order below is fine since multiplication
; is commutative.
;
; @param 1 EFLAGS that are modified.
; @param 2 Undefined EFLAGS.
; @param 3 Function suffix.
; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored here; SF/PF
; are instead calculated from the result and AF/ZF adjusted via
; IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF),
; 2 for AMD (set AF, clear PF, ZF and SF).
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_16, word [A0]        ; A1 *= *A0
        mov     [A0], A1_16             ; store the (truncated) product
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
; Native, Intel and AMD eflags variants.  Only OF/CF are architecturally
; defined for two-operand imul; the rest differ per vendor.
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1165
1166
;
; XCHG for memory operands. This implies locking. No flag changes.
;
; Each function takes two arguments, first the pointer to the memory,
; then the pointer to the register. They all return void.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = *pReg
        xchg    [A0], T0_8              ; atomic swap - xchg with a memory operand is implicitly locked
        mov     [A1], T0_8              ; *pReg = old *pMem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

; 16-bit variant of the above.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

; 64-bit variant - 64-bit hosts only (the 32-bit host version is elsewhere).
%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1207
; Unlocked variants for fDisregardLock mode.
; These deliberately perform the exchange as two plain loads followed by two
; plain stores, i.e. without any atomicity guarantee.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = *pReg
        mov     T1_8, [A0]              ; T1 = *pMem
        mov     [A0], T0_8              ; *pMem = old *pReg
        mov     [A1], T1_8              ; *pReg = old *pMem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

; 16-bit variant of the above.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

; 64-bit variant - 64-bit hosts only.
%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        mov     T1, [A0]
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1247
1248
;
; XADD for memory operands.
;
; Each function takes three arguments, first the pointer to the
; memory/register, then the pointer to the register, and finally a pointer to
; eflags. They all return void.
;
; After the operation *A1 holds the original value of *A0, and *A0 the sum;
; the flags are set as for an ADD of the two values.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = *pReg
        xadd    [A0], T0_8              ; *pMem += T0, T0 = old *pMem
        mov     [A1], T0_8              ; *pReg = old *pMem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

; 16-bit variant of the above.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

; 64-bit variant - 64-bit hosts only.
%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1298
; Locked XADD variants for the LOCK-prefixed instruction forms - identical to
; the unlocked workers above except for the lock prefix on the xadd itself.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = *pReg
        lock xadd [A0], T0_8            ; atomic: *pMem += T0, T0 = old *pMem
        mov     [A1], T0_8              ; *pReg = old *pMem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

; 16-bit variant of the above.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

; 64-bit variant - 64-bit hosts only.
%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1340
1341
;
; CMPXCHG8B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxCcx,
; uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
; Only ZF is defined for cmpxchg8b; the instruction is always emitted with a
; lock prefix (see the _locked alias below).
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee saved; needed for cmpxchg8b's implicit EBX operand

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; EBX:ECX = exchange value (*pu64EbxEcx)
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax); r9 = pEFlags
        mov     eax, [r11]              ; EAX:EDX = comparand (*pu64EaxEdx)
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write back EAX:EDX (the old *pu64Dst on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee saved; needed for cmpxchg8b's implicit EBX operand

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; EBX:ECX = exchange value
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; rsi = pu64EaxEdx, rdi = pu64Dst (SysV arg regs)
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = saved regs, 4 = return address)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX:ECX = exchange value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EAX:EDX = comparand
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; drop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1431
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b, so the locked variant
        ; simply tail-jumps to the worker above.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1436
%ifdef RT_ARCH_AMD64

;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
; uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
; NOTE(review): cmpxchg16b requires a 16-byte aligned memory operand;
; presumably the caller guarantees pu128Dst alignment - confirm.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee saved; needed for cmpxchg16b's implicit RBX operand

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; RBX:RCX = exchange value (*pu128RbxRcx)
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax); r9 = pEFlags
        mov     rax, [r11]              ; RAX:RDX = comparand (*pu128RaxRdx)
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write back RAX:RDX (the old *pu128Dst on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee saved; needed for cmpxchg16b's implicit RBX operand

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; RBX:RCX = exchange value
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rsi = pu128RaxRdx, rdi = pu128Dst (SysV arg regs)
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b

BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked

%endif ; RT_ARCH_AMD64
1505
1506
;
; CMPXCHG.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
;
; A0 = pointer to the destination, A1 = pointer to the accumulator (rAX)
; value, A2 = the replacement value, A3 = pointer to eflags.
;
BEGINCODE
; @param 1 Lock prefix to emit (blank for the plain variant).
; @param 2 Function name suffix (blank or _locked).
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; al = comparand (*puEax)
        %1      cmpxchg [A0], A2_8      ; if *puXDst == al: *puXDst = A2; else: al = *puXDst
        mov     [A1], al                ; write back the (possibly updated) accumulator
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

; 16-bit variant of the above.
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1      cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1      cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1      cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX:ECX = exchange value (*pu64Reg)
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EAX:EDX = comparand (*pu64Rax)
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        jz      .cmpxchg8b_not_equal    ; ZF=1 => values were equal; only ZF needs faking then
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

; Plain and lock-prefixed instantiations.
IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1604
;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; (For these unary workers A1 is in fact the eflags pointer; the operand is
; read and written through A0 only.)
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]               ; apply the operator in place
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; inc/dec leave CF untouched, hence CF is absent from their modified masks;
; neg modifies all six status flags; not changes no flags at all.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1693
1694
;
; BSWAP. No flag changes.
;
; Each function takes one argument, pointer to the value to bswap
; (input/output). They all return void.
;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h                          ; operand-size prefix: turns the next bswap into the 16-bit
        bswap   T0_32                   ; encoding, whose result is architecturally undefined.
                                        ; NOTE(review): relies on host behaviour matching what the
                                        ; emulated guest CPU would do - confirm this is intended.
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16

BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32

BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: bswap each 32-bit half and swap their positions.
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; T0 = low dword
        mov     T1, [A0 + 4]            ; T1 = high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; byte-swapped low half becomes the high half
        mov     [A0], T1                ; and vice versa
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1736
1737
;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; shift count must be in cl
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; A0 is (e/r)cx here, so swap to get the count into cl
        %1      byte [A1], cl           ; ... and the operand pointer into A1.
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
  %else
        xchg    A1, A0
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; Rotates only define OF and CF; plain shifts also define SF/ZF/PF and leave
; AF undefined.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1824
1825
;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The functions takes the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; A3 is rcx here; swap so the count lands in cl ...
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; ... and swap back so A3 is the eflags pointer again.
 %else
        xchg    A0, A2                  ; A0 is (e/r)cx here; swap so the count is in cl and
        %1      [A2], A1_16, cl         ; the destination pointer in A2.
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
  %else
        xchg    A0, A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1896
1897
;;
; Macro for implementing a multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer. The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 so the caller can be used for div/idiv as well as
; for the mul/imul implementation.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 Name suffix.
; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]                ; al = *pu16AX (low half)
        %1      A1_8                    ; ax = al * A1_8
        mov     [A0], ax                ; store the full 16-bit product
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; return 0 (success)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16                   ; dx:ax = ax * operand
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; A1 is rdx on MSC and the instruction clobbers (r)dx,
        %1      A2_16                   ; so save the pointer in T1 first.
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
  %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
  %endif
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; !RT_ARCH_AMD64

%endmacro

; Native, Intel and AMD eflags variants; only OF/CF are architecturally
; defined for mul/imul.
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2015
2016
BEGINCODE
;;
; Worker function for negating a 32-bit number pair in T1:T0 (T1 = high
; dword, T0 = low dword), i.e. T1:T0 = 0 - T1:T0.
;
; Implemented as the two's complement identity -x == ~x + 1, propagating the
; carry from the low into the high half, so no stack or scratch registers
; are required.  (Flags are clobbered, as with any arithmetic here.)
;
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        not     T0_32                   ; invert both halves ...
        not     T1_32
        add     T0_32, 1                ; ... and add one, carrying into the high half.
        adc     T1_32, 0
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2031
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating a 64-bit number pair in T1:T0 (T1 = high
; qword, T0 = low qword), i.e. T1:T0 = 0 - T1:T0.
;
; Implemented as the two's complement identity -x == ~x + 1, propagating the
; carry from the low into the high half, so no stack or scratch registers
; are required.  (Flags are clobbered, as with any arithmetic here.)
;
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        not     T0                      ; invert both halves ...
        not     T1
        add     T0, 1                   ; ... and add one, carrying into the high half.
        adc     T1, 0
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2047
2048
2049;;
2050; Macro for implementing a division operations.
2051;
2052; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2053; 32-bit system where the 64-bit accesses requires hand coding.
2054;
2055; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2056; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2057; pointer to eflags in A3.
2058;
2059; The functions all return 0 on success and -1 if a divide error should be
2060; raised by the caller.
2061;
2062; @param 1 The instruction mnemonic.
2063; @param 2 The modified flags.
2064; @param 3 The undefined flags.
2065; @param 4 1 if signed, 0 if unsigned.
2066; @param 5 Function suffix.
2067; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2068; 2 for AMD (set AF, clear PF, ZF and SF).
2069;
2070; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2071;
2072%macro IEMIMPL_DIV_OP 6
2073BEGINCODE
2074BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2075 PROLOGUE_3_ARGS
2076
2077 ; div by chainsaw check.
2078 test A1_8, A1_8
2079 jz .div_zero
2080
2081 ; Overflow check - unsigned division is simple to verify, haven't
2082 ; found a simple way to check signed division yet unfortunately.
2083 %if %4 == 0
2084 cmp [A0 + 1], A1_8
2085 jae .div_overflow
2086 %else
2087 mov T0_16, [A0] ; T0 = dividend
2088 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2089 test A1_8, A1_8
2090 js .divisor_negative
2091 test T0_16, T0_16
2092 jns .both_positive
2093 neg T0_16
2094.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2095 push T0 ; Start off like unsigned below.
2096 shr T0_16, 7
2097 cmp T0_8, A1_8
2098 pop T0
2099 jb .div_no_overflow
2100 ja .div_overflow
2101 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2102 cmp T0_8, A1_8
2103 jae .div_overflow
2104 jmp .div_no_overflow
2105
2106.divisor_negative:
2107 neg A1_8
2108 test T0_16, T0_16
2109 jns .one_of_each
2110 neg T0_16
2111.both_positive: ; Same as unsigned shifted by sign indicator bit.
2112 shr T0_16, 7
2113 cmp T0_8, A1_8
2114 jae .div_overflow
2115.div_no_overflow:
2116 mov A1, T1 ; restore divisor
2117 %endif
2118
2119 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2120 mov ax, [A0]
2121 %1 A1_8
2122 mov [A0], ax
2123 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2124 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2125 %else
2126 IEM_SAVE_FLAGS A2, %2, %3
2127 %endif
2128 xor eax, eax
2129
2130.return:
2131 EPILOGUE_3_ARGS
2132
2133.div_zero:
2134.div_overflow:
2135 mov eax, -1
2136 jmp .return
2137ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2138
2139BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2140 PROLOGUE_4_ARGS
2141
2142 ; div by chainsaw check.
2143 test A2_16, A2_16
2144 jz .div_zero
2145
2146 ; Overflow check - unsigned division is simple to verify, haven't
2147 ; found a simple way to check signed division yet unfortunately.
2148 %if %4 == 0
2149 cmp [A1], A2_16
2150 jae .div_overflow
2151 %else
2152 mov T0_16, [A1]
2153 shl T0_32, 16
2154 mov T0_16, [A0] ; T0 = dividend
2155 mov T1, A2 ; T1 = divisor
2156 test T1_16, T1_16
2157 js .divisor_negative
2158 test T0_32, T0_32
2159 jns .both_positive
2160 neg T0_32
2161.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2162 push T0 ; Start off like unsigned below.
2163 shr T0_32, 15
2164 cmp T0_16, T1_16
2165 pop T0
2166 jb .div_no_overflow
2167 ja .div_overflow
2168 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2169 cmp T0_16, T1_16
2170 jae .div_overflow
2171 jmp .div_no_overflow
2172
2173.divisor_negative:
2174 neg T1_16
2175 test T0_32, T0_32
2176 jns .one_of_each
2177 neg T0_32
2178.both_positive: ; Same as unsigned shifted by sign indicator bit.
2179 shr T0_32, 15
2180 cmp T0_16, T1_16
2181 jae .div_overflow
2182.div_no_overflow:
2183 %endif
2184
2185 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2186 %ifdef ASM_CALL64_GCC
2187 mov T1, A2
2188 mov ax, [A0]
2189 mov dx, [A1]
2190 %1 T1_16
2191 mov [A0], ax
2192 mov [A1], dx
2193 %else
2194 mov T1, A1
2195 mov ax, [A0]
2196 mov dx, [T1]
2197 %1 A2_16
2198 mov [A0], ax
2199 mov [T1], dx
2200 %endif
2201 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2202 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2203 %else
2204 IEM_SAVE_FLAGS A3, %2, %3
2205 %endif
2206 xor eax, eax
2207
2208.return:
2209 EPILOGUE_4_ARGS
2210
2211.div_zero:
2212.div_overflow:
2213 mov eax, -1
2214 jmp .return
2215ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2216
2217BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2218 PROLOGUE_4_ARGS
2219
2220 ; div by chainsaw check.
2221 test A2_32, A2_32
2222 jz .div_zero
2223
2224 ; Overflow check - unsigned division is simple to verify, haven't
2225 ; found a simple way to check signed division yet unfortunately.
2226 %if %4 == 0
2227 cmp [A1], A2_32
2228 jae .div_overflow
2229 %else
2230 push A2 ; save A2 so we modify it (we out of regs on x86).
2231 mov T0_32, [A0] ; T0 = dividend low
2232 mov T1_32, [A1] ; T1 = dividend high
2233 test A2_32, A2_32
2234 js .divisor_negative
2235 test T1_32, T1_32
2236 jns .both_positive
2237 call NAME(iemAImpl_negate_T0_T1_u32)
2238.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2239 push T0 ; Start off like unsigned below.
2240 shl T1_32, 1
2241 shr T0_32, 31
2242 or T1_32, T0_32
2243 cmp T1_32, A2_32
2244 pop T0
2245 jb .div_no_overflow
2246 ja .div_overflow
2247 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2248 cmp T0_32, A2_32
2249 jae .div_overflow
2250 jmp .div_no_overflow
2251
2252.divisor_negative:
2253 neg A2_32
2254 test T1_32, T1_32
2255 jns .one_of_each
2256 call NAME(iemAImpl_negate_T0_T1_u32)
2257.both_positive: ; Same as unsigned shifted by sign indicator bit.
2258 shl T1_32, 1
2259 shr T0_32, 31
2260 or T1_32, T0_32
2261 cmp T1_32, A2_32
2262 jae .div_overflow
2263.div_no_overflow:
2264 pop A2
2265 %endif
2266
2267 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2268 mov eax, [A0]
2269 %ifdef ASM_CALL64_GCC
2270 mov T1, A2
2271 mov eax, [A0]
2272 mov edx, [A1]
2273 %1 T1_32
2274 mov [A0], eax
2275 mov [A1], edx
2276 %else
2277 mov T1, A1
2278 mov eax, [A0]
2279 mov edx, [T1]
2280 %1 A2_32
2281 mov [A0], eax
2282 mov [T1], edx
2283 %endif
2284 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2285 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2286 %else
2287 IEM_SAVE_FLAGS A3, %2, %3
2288 %endif
2289 xor eax, eax
2290
2291.return:
2292 EPILOGUE_4_ARGS
2293
2294.div_overflow:
2295 %if %4 != 0
2296 pop A2
2297 %endif
2298.div_zero:
2299 mov eax, -1
2300 jmp .return
2301ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2302
2303 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2304BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2305 PROLOGUE_4_ARGS
2306
2307 test A2, A2
2308 jz .div_zero
2309 %if %4 == 0
2310 cmp [A1], A2
2311 jae .div_overflow
2312 %else
2313 push A2 ; save A2 so we modify it (we out of regs on x86).
2314 mov T0, [A0] ; T0 = dividend low
2315 mov T1, [A1] ; T1 = dividend high
2316 test A2, A2
2317 js .divisor_negative
2318 test T1, T1
2319 jns .both_positive
2320 call NAME(iemAImpl_negate_T0_T1_u64)
2321.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2322 push T0 ; Start off like unsigned below.
2323 shl T1, 1
2324 shr T0, 63
2325 or T1, T0
2326 cmp T1, A2
2327 pop T0
2328 jb .div_no_overflow
2329 ja .div_overflow
2330 mov T1, 0x7fffffffffffffff
2331 and T0, T1 ; Special case for covering (divisor - 1).
2332 cmp T0, A2
2333 jae .div_overflow
2334 jmp .div_no_overflow
2335
2336.divisor_negative:
2337 neg A2
2338 test T1, T1
2339 jns .one_of_each
2340 call NAME(iemAImpl_negate_T0_T1_u64)
2341.both_positive: ; Same as unsigned shifted by sign indicator bit.
2342 shl T1, 1
2343 shr T0, 63
2344 or T1, T0
2345 cmp T1, A2
2346 jae .div_overflow
2347.div_no_overflow:
2348 pop A2
2349 %endif
2350
2351 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2352 mov rax, [A0]
2353 %ifdef ASM_CALL64_GCC
2354 mov T1, A2
2355 mov rax, [A0]
2356 mov rdx, [A1]
2357 %1 T1
2358 mov [A0], rax
2359 mov [A1], rdx
2360 %else
2361 mov T1, A1
2362 mov rax, [A0]
2363 mov rdx, [T1]
2364 %1 A2
2365 mov [A0], rax
2366 mov [T1], rdx
2367 %endif
2368 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2369 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2370 %else
2371 IEM_SAVE_FLAGS A3, %2, %3
2372 %endif
2373 xor eax, eax
2374
2375.return:
2376 EPILOGUE_4_ARGS_EX 12
2377
2378.div_overflow:
2379 %if %4 != 0
2380 pop A2
2381 %endif
2382.div_zero:
2383 mov eax, -1
2384 jmp .return
2385ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2386 %endif ; !RT_ARCH_AMD64
2387
2388%endmacro
2389
; Instantiate div (unsigned, %4=0) and idiv (signed, %4=1) in three EFLAGS
; flavors each (selected by %6 inside the macro):
;   - native (%6=0): all six arithmetic status flags marked modified/undefined,
;   - _intel (%6=1): flag masks are zero, i.e. EFLAGS is left untouched,
;   - _amd   (%6=2): AF set, PF/ZF/SF cleared (observed on AMD64 3990X).
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2396
2397
2398;;
2399; Macro for implementing memory fence operation.
2400;
2401; No return value, no operands or anything.
2402;
2403; @param 1 The instruction.
2404;
2405%macro IEMIMPL_MEM_FENCE 1
2406BEGINCODE
2407BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
2408 %1
2409 ret
2410ENDPROC iemAImpl_ %+ %1
2411%endmacro
2412
2413IEMIMPL_MEM_FENCE lfence
2414IEMIMPL_MEM_FENCE sfence
2415IEMIMPL_MEM_FENCE mfence
2416
2417;;
2418; Alternative for non-SSE2 host.
2419;
2420BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
2421 push xAX
2422 xchg xAX, [xSP]
2423 add xSP, xCB
2424 ret
2425ENDPROC iemAImpl_alt_mem_fence
2426
2427
2428;;
2429; Initialize the FPU for the actual instruction being emulated, this means
2430; loading parts of the guest's control word and status word.
2431;
2432; @uses 24 bytes of stack. T0, T1
2433; @param 1 Expression giving the address of the FXSTATE of the guest.
2434;
2435%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
2436 fnstenv [xSP]
2437
2438 ; FCW - for exception, precision and rounding control.
2439 movzx T0, word [%1 + X86FXSTATE.FCW]
2440 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2441 mov [xSP + X86FSTENV32P.FCW], T0_16
2442
2443 ; FSW - for undefined C0, C1, C2, and C3.
2444 movzx T1, word [%1 + X86FXSTATE.FSW]
2445 and T1, X86_FSW_C_MASK
2446 movzx T0, word [xSP + X86FSTENV32P.FSW]
2447 and T0, X86_FSW_TOP_MASK
2448 or T0, T1
2449 mov [xSP + X86FSTENV32P.FSW], T0_16
2450
2451 fldenv [xSP]
2452%endmacro
2453
2454
2455;;
2456; Initialize the FPU for the actual instruction being emulated, this means
2457; loading parts of the guest's control word, status word, and update the
2458; tag word for the top register if it's empty.
2459;
2460; ASSUMES actual TOP=7
2461;
2462; @uses 24 bytes of stack. T0, T1
2463; @param 1 Expression giving the address of the FXSTATE of the guest.
2464;
2465%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
2466 fnstenv [xSP]
2467
2468 ; FCW - for exception, precision and rounding control.
2469 movzx T0_32, word [%1 + X86FXSTATE.FCW]
2470 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2471 mov [xSP + X86FSTENV32P.FCW], T0_16
2472
2473 ; FSW - for undefined C0, C1, C2, and C3.
2474 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2475 and T1_32, X86_FSW_C_MASK
2476 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
2477 and T0_32, X86_FSW_TOP_MASK
2478 or T0_32, T1_32
2479 mov [xSP + X86FSTENV32P.FSW], T0_16
2480
2481 ; FTW - Only for ST0 (in/out).
2482 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2483 shr T1_32, X86_FSW_TOP_SHIFT
2484 and T1_32, X86_FSW_TOP_SMASK
2485 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
2486 jc %%st0_not_empty
2487 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
2488%%st0_not_empty:
2489
2490 fldenv [xSP]
2491%endmacro
2492
2493
2494;;
2495; Need to move this as well somewhere better?
2496;
2497struc IEMFPURESULT
2498 .r80Result resw 5
2499 .FSW resw 1
2500endstruc
2501
2502
2503;;
2504; Need to move this as well somewhere better?
2505;
2506struc IEMFPURESULTTWO
2507 .r80Result1 resw 5
2508 .FSW resw 1
2509 .r80Result2 resw 5
2510endstruc
2511
2512
2513;
2514;---------------------- 16-bit signed integer operations ----------------------
2515;
2516
2517
2518;;
2519; Converts a 16-bit floating point value to a 80-bit one (fpu register).
2520;
2521; @param A0 FPU context (fxsave).
2522; @param A1 Pointer to a IEMFPURESULT for the output.
2523; @param A2 Pointer to the 16-bit floating point value to convert.
2524;
2525BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
2526 PROLOGUE_3_ARGS
2527 sub xSP, 20h
2528
2529 fninit
2530 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2531 fild word [A2]
2532
2533 fnstsw word [A1 + IEMFPURESULT.FSW]
2534 fnclex
2535 fstp tword [A1 + IEMFPURESULT.r80Result]
2536
2537 fninit
2538 add xSP, 20h
2539 EPILOGUE_3_ARGS
2540ENDPROC iemAImpl_fild_r80_from_i16
2541
2542
2543;;
2544; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2545;
2546; @param A0 FPU context (fxsave).
2547; @param A1 Where to return the output FSW.
2548; @param A2 Where to store the 16-bit signed integer value.
2549; @param A3 Pointer to the 80-bit value.
2550;
2551BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
2552 PROLOGUE_4_ARGS
2553 sub xSP, 20h
2554
2555 fninit
2556 fld tword [A3]
2557 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2558 fistp word [A2]
2559
2560 fnstsw word [A1]
2561
2562 fninit
2563 add xSP, 20h
2564 EPILOGUE_4_ARGS
2565ENDPROC iemAImpl_fist_r80_to_i16
2566
2567
2568;;
2569; Store a 80-bit floating point value (register) as a 16-bit signed integer
2570; (memory) with truncation.
2571;
2572; @param A0 FPU context (fxsave).
2573; @param A1 Where to return the output FSW.
2574; @param A2 Where to store the 16-bit signed integer value.
2575; @param A3 Pointer to the 80-bit value.
2576;
2577BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
2578 PROLOGUE_4_ARGS
2579 sub xSP, 20h
2580
2581 fninit
2582 fld tword [A3]
2583 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2584 fisttp word [A2]
2585
2586 fnstsw word [A1]
2587
2588 fninit
2589 add xSP, 20h
2590 EPILOGUE_4_ARGS
2591ENDPROC iemAImpl_fistt_r80_to_i16
2592
2593
2594;;
2595; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2596;
2597; @param 1 The instruction
2598;
2599; @param A0 FPU context (fxsave).
2600; @param A1 Pointer to a IEMFPURESULT for the output.
2601; @param A2 Pointer to the 80-bit value.
2602; @param A3 Pointer to the 16-bit value.
2603;
2604%macro IEMIMPL_FPU_R80_BY_I16 1
2605BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2606 PROLOGUE_4_ARGS
2607 sub xSP, 20h
2608
2609 fninit
2610 fld tword [A2]
2611 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2612 %1 word [A3]
2613
2614 fnstsw word [A1 + IEMFPURESULT.FSW]
2615 fnclex
2616 fstp tword [A1 + IEMFPURESULT.r80Result]
2617
2618 fninit
2619 add xSP, 20h
2620 EPILOGUE_4_ARGS
2621ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2622%endmacro
2623
2624IEMIMPL_FPU_R80_BY_I16 fiadd
2625IEMIMPL_FPU_R80_BY_I16 fimul
2626IEMIMPL_FPU_R80_BY_I16 fisub
2627IEMIMPL_FPU_R80_BY_I16 fisubr
2628IEMIMPL_FPU_R80_BY_I16 fidiv
2629IEMIMPL_FPU_R80_BY_I16 fidivr
2630
2631
2632;;
2633; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2634; only returning FSW.
2635;
2636; @param 1 The instruction
2637;
2638; @param A0 FPU context (fxsave).
2639; @param A1 Where to store the output FSW.
2640; @param A2 Pointer to the 80-bit value.
2641; @param A3 Pointer to the 64-bit value.
2642;
2643%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
2644BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2645 PROLOGUE_4_ARGS
2646 sub xSP, 20h
2647
2648 fninit
2649 fld tword [A2]
2650 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2651 %1 word [A3]
2652
2653 fnstsw word [A1]
2654
2655 fninit
2656 add xSP, 20h
2657 EPILOGUE_4_ARGS
2658ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2659%endmacro
2660
2661IEMIMPL_FPU_R80_BY_I16_FSW ficom
2662
2663
2664
2665;
2666;---------------------- 32-bit signed integer operations ----------------------
2667;
2668
2669
2670;;
2671; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2672;
2673; @param A0 FPU context (fxsave).
2674; @param A1 Pointer to a IEMFPURESULT for the output.
2675; @param A2 Pointer to the 32-bit floating point value to convert.
2676;
2677BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
2678 PROLOGUE_3_ARGS
2679 sub xSP, 20h
2680
2681 fninit
2682 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2683 fild dword [A2]
2684
2685 fnstsw word [A1 + IEMFPURESULT.FSW]
2686 fnclex
2687 fstp tword [A1 + IEMFPURESULT.r80Result]
2688
2689 fninit
2690 add xSP, 20h
2691 EPILOGUE_3_ARGS
2692ENDPROC iemAImpl_fild_r80_from_i32
2693
2694
2695;;
2696; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2697;
2698; @param A0 FPU context (fxsave).
2699; @param A1 Where to return the output FSW.
2700; @param A2 Where to store the 32-bit signed integer value.
2701; @param A3 Pointer to the 80-bit value.
2702;
2703BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2704 PROLOGUE_4_ARGS
2705 sub xSP, 20h
2706
2707 fninit
2708 fld tword [A3]
2709 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2710 fistp dword [A2]
2711
2712 fnstsw word [A1]
2713
2714 fninit
2715 add xSP, 20h
2716 EPILOGUE_4_ARGS
2717ENDPROC iemAImpl_fist_r80_to_i32
2718
2719
2720;;
2721; Store a 80-bit floating point value (register) as a 32-bit signed integer
2722; (memory) with truncation.
2723;
2724; @param A0 FPU context (fxsave).
2725; @param A1 Where to return the output FSW.
2726; @param A2 Where to store the 32-bit signed integer value.
2727; @param A3 Pointer to the 80-bit value.
2728;
2729BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2730 PROLOGUE_4_ARGS
2731 sub xSP, 20h
2732
2733 fninit
2734 fld tword [A3]
2735 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2736 fisttp dword [A2]
2737
2738 fnstsw word [A1]
2739
2740 fninit
2741 add xSP, 20h
2742 EPILOGUE_4_ARGS
2743ENDPROC iemAImpl_fistt_r80_to_i32
2744
2745
2746;;
2747; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2748;
2749; @param 1 The instruction
2750;
2751; @param A0 FPU context (fxsave).
2752; @param A1 Pointer to a IEMFPURESULT for the output.
2753; @param A2 Pointer to the 80-bit value.
2754; @param A3 Pointer to the 32-bit value.
2755;
2756%macro IEMIMPL_FPU_R80_BY_I32 1
2757BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2758 PROLOGUE_4_ARGS
2759 sub xSP, 20h
2760
2761 fninit
2762 fld tword [A2]
2763 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2764 %1 dword [A3]
2765
2766 fnstsw word [A1 + IEMFPURESULT.FSW]
2767 fnclex
2768 fstp tword [A1 + IEMFPURESULT.r80Result]
2769
2770 fninit
2771 add xSP, 20h
2772 EPILOGUE_4_ARGS
2773ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2774%endmacro
2775
2776IEMIMPL_FPU_R80_BY_I32 fiadd
2777IEMIMPL_FPU_R80_BY_I32 fimul
2778IEMIMPL_FPU_R80_BY_I32 fisub
2779IEMIMPL_FPU_R80_BY_I32 fisubr
2780IEMIMPL_FPU_R80_BY_I32 fidiv
2781IEMIMPL_FPU_R80_BY_I32 fidivr
2782
2783
2784;;
2785; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2786; only returning FSW.
2787;
2788; @param 1 The instruction
2789;
2790; @param A0 FPU context (fxsave).
2791; @param A1 Where to store the output FSW.
2792; @param A2 Pointer to the 80-bit value.
2793; @param A3 Pointer to the 64-bit value.
2794;
2795%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2796BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2797 PROLOGUE_4_ARGS
2798 sub xSP, 20h
2799
2800 fninit
2801 fld tword [A2]
2802 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2803 %1 dword [A3]
2804
2805 fnstsw word [A1]
2806
2807 fninit
2808 add xSP, 20h
2809 EPILOGUE_4_ARGS
2810ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2811%endmacro
2812
2813IEMIMPL_FPU_R80_BY_I32_FSW ficom
2814
2815
2816
2817;
2818;---------------------- 64-bit signed integer operations ----------------------
2819;
2820
2821
2822;;
2823; Converts a 64-bit floating point value to a 80-bit one (fpu register).
2824;
2825; @param A0 FPU context (fxsave).
2826; @param A1 Pointer to a IEMFPURESULT for the output.
2827; @param A2 Pointer to the 64-bit floating point value to convert.
2828;
2829BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
2830 PROLOGUE_3_ARGS
2831 sub xSP, 20h
2832
2833 fninit
2834 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2835 fild qword [A2]
2836
2837 fnstsw word [A1 + IEMFPURESULT.FSW]
2838 fnclex
2839 fstp tword [A1 + IEMFPURESULT.r80Result]
2840
2841 fninit
2842 add xSP, 20h
2843 EPILOGUE_3_ARGS
2844ENDPROC iemAImpl_fild_r80_from_i64
2845
2846
2847;;
2848; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2849;
2850; @param A0 FPU context (fxsave).
2851; @param A1 Where to return the output FSW.
2852; @param A2 Where to store the 64-bit signed integer value.
2853; @param A3 Pointer to the 80-bit value.
2854;
2855BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2856 PROLOGUE_4_ARGS
2857 sub xSP, 20h
2858
2859 fninit
2860 fld tword [A3]
2861 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2862 fistp qword [A2]
2863
2864 fnstsw word [A1]
2865
2866 fninit
2867 add xSP, 20h
2868 EPILOGUE_4_ARGS
2869ENDPROC iemAImpl_fist_r80_to_i64
2870
2871
2872;;
2873; Store a 80-bit floating point value (register) as a 64-bit signed integer
2874; (memory) with truncation.
2875;
2876; @param A0 FPU context (fxsave).
2877; @param A1 Where to return the output FSW.
2878; @param A2 Where to store the 64-bit signed integer value.
2879; @param A3 Pointer to the 80-bit value.
2880;
2881BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
2882 PROLOGUE_4_ARGS
2883 sub xSP, 20h
2884
2885 fninit
2886 fld tword [A3]
2887 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2888 fisttp qword [A2]
2889
2890 fnstsw word [A1]
2891
2892 fninit
2893 add xSP, 20h
2894 EPILOGUE_4_ARGS
2895ENDPROC iemAImpl_fistt_r80_to_i64
2896
2897
2898
2899;
2900;---------------------- 32-bit floating point operations ----------------------
2901;
2902
2903;;
2904; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2905;
2906; @param A0 FPU context (fxsave).
2907; @param A1 Pointer to a IEMFPURESULT for the output.
2908; @param A2 Pointer to the 32-bit floating point value to convert.
2909;
2910BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
2911 PROLOGUE_3_ARGS
2912 sub xSP, 20h
2913
2914 fninit
2915 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2916 fld dword [A2]
2917
2918 fnstsw word [A1 + IEMFPURESULT.FSW]
2919 fnclex
2920 fstp tword [A1 + IEMFPURESULT.r80Result]
2921
2922 fninit
2923 add xSP, 20h
2924 EPILOGUE_3_ARGS
2925ENDPROC iemAImpl_fld_r80_from_r32
2926
2927
2928;;
2929; Store a 80-bit floating point value (register) as a 32-bit one (memory).
2930;
2931; @param A0 FPU context (fxsave).
2932; @param A1 Where to return the output FSW.
2933; @param A2 Where to store the 32-bit value.
2934; @param A3 Pointer to the 80-bit value.
2935;
2936BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
2937 PROLOGUE_4_ARGS
2938 sub xSP, 20h
2939
2940 fninit
2941 fld tword [A3]
2942 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2943 fst dword [A2]
2944
2945 fnstsw word [A1]
2946
2947 fninit
2948 add xSP, 20h
2949 EPILOGUE_4_ARGS
2950ENDPROC iemAImpl_fst_r80_to_r32
2951
2952
2953;;
2954; FPU instruction working on one 80-bit and one 32-bit floating point value.
2955;
2956; @param 1 The instruction
2957;
2958; @param A0 FPU context (fxsave).
2959; @param A1 Pointer to a IEMFPURESULT for the output.
2960; @param A2 Pointer to the 80-bit value.
2961; @param A3 Pointer to the 32-bit value.
2962;
2963%macro IEMIMPL_FPU_R80_BY_R32 1
2964BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2965 PROLOGUE_4_ARGS
2966 sub xSP, 20h
2967
2968 fninit
2969 fld tword [A2]
2970 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2971 %1 dword [A3]
2972
2973 fnstsw word [A1 + IEMFPURESULT.FSW]
2974 fnclex
2975 fstp tword [A1 + IEMFPURESULT.r80Result]
2976
2977 fninit
2978 add xSP, 20h
2979 EPILOGUE_4_ARGS
2980ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2981%endmacro
2982
2983IEMIMPL_FPU_R80_BY_R32 fadd
2984IEMIMPL_FPU_R80_BY_R32 fmul
2985IEMIMPL_FPU_R80_BY_R32 fsub
2986IEMIMPL_FPU_R80_BY_R32 fsubr
2987IEMIMPL_FPU_R80_BY_R32 fdiv
2988IEMIMPL_FPU_R80_BY_R32 fdivr
2989
2990
2991;;
2992; FPU instruction working on one 80-bit and one 32-bit floating point value,
2993; only returning FSW.
2994;
2995; @param 1 The instruction
2996;
2997; @param A0 FPU context (fxsave).
2998; @param A1 Where to store the output FSW.
2999; @param A2 Pointer to the 80-bit value.
3000; @param A3 Pointer to the 64-bit value.
3001;
3002%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3003BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3004 PROLOGUE_4_ARGS
3005 sub xSP, 20h
3006
3007 fninit
3008 fld tword [A2]
3009 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3010 %1 dword [A3]
3011
3012 fnstsw word [A1]
3013
3014 fninit
3015 add xSP, 20h
3016 EPILOGUE_4_ARGS
3017ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3018%endmacro
3019
3020IEMIMPL_FPU_R80_BY_R32_FSW fcom
3021
3022
3023
3024;
3025;---------------------- 64-bit floating point operations ----------------------
3026;
3027
3028;;
3029; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3030;
3031; @param A0 FPU context (fxsave).
3032; @param A1 Pointer to a IEMFPURESULT for the output.
3033; @param A2 Pointer to the 64-bit floating point value to convert.
3034;
3035BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3036 PROLOGUE_3_ARGS
3037 sub xSP, 20h
3038
3039 fninit
3040 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3041 fld qword [A2]
3042
3043 fnstsw word [A1 + IEMFPURESULT.FSW]
3044 fnclex
3045 fstp tword [A1 + IEMFPURESULT.r80Result]
3046
3047 fninit
3048 add xSP, 20h
3049 EPILOGUE_3_ARGS
3050ENDPROC iemAImpl_fld_r80_from_r64
3051
3052
3053;;
3054; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3055;
3056; @param A0 FPU context (fxsave).
3057; @param A1 Where to return the output FSW.
3058; @param A2 Where to store the 64-bit value.
3059; @param A3 Pointer to the 80-bit value.
3060;
3061BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3062 PROLOGUE_4_ARGS
3063 sub xSP, 20h
3064
3065 fninit
3066 fld tword [A3]
3067 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3068 fst qword [A2]
3069
3070 fnstsw word [A1]
3071
3072 fninit
3073 add xSP, 20h
3074 EPILOGUE_4_ARGS
3075ENDPROC iemAImpl_fst_r80_to_r64
3076
3077
3078;;
3079; FPU instruction working on one 80-bit and one 64-bit floating point value.
3080;
3081; @param 1 The instruction
3082;
3083; @param A0 FPU context (fxsave).
3084; @param A1 Pointer to a IEMFPURESULT for the output.
3085; @param A2 Pointer to the 80-bit value.
3086; @param A3 Pointer to the 64-bit value.
3087;
3088%macro IEMIMPL_FPU_R80_BY_R64 1
3089BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3090 PROLOGUE_4_ARGS
3091 sub xSP, 20h
3092
3093 fninit
3094 fld tword [A2]
3095 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3096 %1 qword [A3]
3097
3098 fnstsw word [A1 + IEMFPURESULT.FSW]
3099 fnclex
3100 fstp tword [A1 + IEMFPURESULT.r80Result]
3101
3102 fninit
3103 add xSP, 20h
3104 EPILOGUE_4_ARGS
3105ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3106%endmacro
3107
3108IEMIMPL_FPU_R80_BY_R64 fadd
3109IEMIMPL_FPU_R80_BY_R64 fmul
3110IEMIMPL_FPU_R80_BY_R64 fsub
3111IEMIMPL_FPU_R80_BY_R64 fsubr
3112IEMIMPL_FPU_R80_BY_R64 fdiv
3113IEMIMPL_FPU_R80_BY_R64 fdivr
3114
3115;;
3116; FPU instruction working on one 80-bit and one 64-bit floating point value,
3117; only returning FSW.
3118;
3119; @param 1 The instruction
3120;
3121; @param A0 FPU context (fxsave).
3122; @param A1 Where to store the output FSW.
3123; @param A2 Pointer to the 80-bit value.
3124; @param A3 Pointer to the 64-bit value.
3125;
3126%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3127BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3128 PROLOGUE_4_ARGS
3129 sub xSP, 20h
3130
3131 fninit
3132 fld tword [A2]
3133 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3134 %1 qword [A3]
3135
3136 fnstsw word [A1]
3137
3138 fninit
3139 add xSP, 20h
3140 EPILOGUE_4_ARGS
3141ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3142%endmacro
3143
3144IEMIMPL_FPU_R80_BY_R64_FSW fcom
3145
3146
3147
3148;
3149;---------------------- 80-bit floating point operations ----------------------
3150;
3151
3152;;
3153; Loads a 80-bit floating point register value from memory.
3154;
3155; @param A0 FPU context (fxsave).
3156; @param A1 Pointer to a IEMFPURESULT for the output.
3157; @param A2 Pointer to the 80-bit floating point value to load.
3158;
3159BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3160 PROLOGUE_3_ARGS
3161 sub xSP, 20h
3162
3163 fninit
3164 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3165 fld tword [A2]
3166
3167 fnstsw word [A1 + IEMFPURESULT.FSW]
3168 fnclex
3169 fstp tword [A1 + IEMFPURESULT.r80Result]
3170
3171 fninit
3172 add xSP, 20h
3173 EPILOGUE_3_ARGS
3174ENDPROC iemAImpl_fld_r80_from_r80
3175
3176
3177;;
3178; Store a 80-bit floating point register to memory
3179;
3180; @param A0 FPU context (fxsave).
3181; @param A1 Where to return the output FSW.
3182; @param A2 Where to store the 80-bit value.
3183; @param A3 Pointer to the 80-bit register value.
3184;
3185BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3186 PROLOGUE_4_ARGS
3187 sub xSP, 20h
3188
3189 fninit
3190 fld tword [A3]
3191 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3192 fstp tword [A2]
3193
3194 fnstsw word [A1]
3195
3196 fninit
3197 add xSP, 20h
3198 EPILOGUE_4_ARGS
3199ENDPROC iemAImpl_fst_r80_to_r80
3200
3201
3202;;
3203; Loads an 80-bit floating point register value in BCD format from memory.
3204;
3205; @param A0 FPU context (fxsave).
3206; @param A1 Pointer to a IEMFPURESULT for the output.
3207; @param A2 Pointer to the 80-bit BCD value to load.
3208;
3209BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3210 PROLOGUE_3_ARGS
3211 sub xSP, 20h
3212
3213 fninit
3214 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3215 fbld tword [A2]
3216
3217 fnstsw word [A1 + IEMFPURESULT.FSW]
3218 fnclex
3219 fstp tword [A1 + IEMFPURESULT.r80Result]
3220
3221 fninit
3222 add xSP, 20h
3223 EPILOGUE_3_ARGS
3224ENDPROC iemAImpl_fld_r80_from_d80
3225
3226
3227;;
3228; Store a 80-bit floating point register to memory as BCD
3229;
3230; @param A0 FPU context (fxsave).
3231; @param A1 Where to return the output FSW.
3232; @param A2 Where to store the 80-bit BCD value.
3233; @param A3 Pointer to the 80-bit register value.
3234;
3235BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3236 PROLOGUE_4_ARGS
3237 sub xSP, 20h
3238
3239 fninit
3240 fld tword [A3]
3241 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3242 fbstp tword [A2]
3243
3244 fnstsw word [A1]
3245
3246 fninit
3247 add xSP, 20h
3248 EPILOGUE_4_ARGS
3249ENDPROC iemAImpl_fst_r80_to_d80
3250
3251
3252;;
3253; FPU instruction working on two 80-bit floating point values.
3254;
3255; @param 1 The instruction
3256;
3257; @param A0 FPU context (fxsave).
3258; @param A1 Pointer to a IEMFPURESULT for the output.
3259; @param A2 Pointer to the first 80-bit value (ST0)
3260; @param A3 Pointer to the second 80-bit value (STn).
3261;
3262%macro IEMIMPL_FPU_R80_BY_R80 2
3263BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3264 PROLOGUE_4_ARGS
3265 sub xSP, 20h
3266
3267 fninit
3268 fld tword [A3]
3269 fld tword [A2]
3270 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3271 %1 %2
3272
3273 fnstsw word [A1 + IEMFPURESULT.FSW]
3274 fnclex
3275 fstp tword [A1 + IEMFPURESULT.r80Result]
3276
3277 fninit
3278 add xSP, 20h
3279 EPILOGUE_4_ARGS
3280ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3281%endmacro
3282
3283IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3284IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3285IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3286IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3287IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3288IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3289IEMIMPL_FPU_R80_BY_R80 fprem, {}
3290IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3291IEMIMPL_FPU_R80_BY_R80 fscale, {}
3292
3293
3294;;
3295; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3296; storing the result in ST1 and popping the stack.
3297;
3298; @param 1 The instruction
3299;
3300; @param A0 FPU context (fxsave).
3301; @param A1 Pointer to a IEMFPURESULT for the output.
3302; @param A2 Pointer to the first 80-bit value (ST1).
3303; @param A3 Pointer to the second 80-bit value (ST0).
3304;
3305%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3306BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3307 PROLOGUE_4_ARGS
3308 sub xSP, 20h
3309
3310 fninit
3311 fld tword [A2]
3312 fld tword [A3]
3313 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3314 %1
3315
3316 fnstsw word [A1 + IEMFPURESULT.FSW]
3317 fnclex
3318 fstp tword [A1 + IEMFPURESULT.r80Result]
3319
3320 fninit
3321 add xSP, 20h
3322 EPILOGUE_4_ARGS
3323ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3324%endmacro
3325
3326IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3327IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3328IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3329
3330
3331;;
3332; FPU instruction working on two 80-bit floating point values, only
3333; returning FSW.
3334;
3335; @param 1 The instruction
3336;
3337; @param A0 FPU context (fxsave).
3338; @param A1 Pointer to a uint16_t for the resulting FSW.
3339; @param A2 Pointer to the first 80-bit value.
3340; @param A3 Pointer to the second 80-bit value.
3341;
3342%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3343BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3344 PROLOGUE_4_ARGS
3345 sub xSP, 20h
3346
3347 fninit
3348 fld tword [A3]
3349 fld tword [A2]
3350 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3351 %1 st0, st1
3352
3353 fnstsw word [A1]
3354
3355 fninit
3356 add xSP, 20h
3357 EPILOGUE_4_ARGS
3358ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3359%endmacro
3360
3361IEMIMPL_FPU_R80_BY_R80_FSW fcom
3362IEMIMPL_FPU_R80_BY_R80_FSW fucom
3363
3364
3365;;
3366; FPU instruction working on two 80-bit floating point values,
3367; returning FSW and EFLAGS (eax).
3368;
3369; @param 1 The instruction
3370;
3371; @returns EFLAGS in EAX.
3372; @param A0 FPU context (fxsave).
3373; @param A1 Pointer to a uint16_t for the resulting FSW.
3374; @param A2 Pointer to the first 80-bit value.
3375; @param A3 Pointer to the second 80-bit value.
3376;
3377%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
3378BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3379 PROLOGUE_4_ARGS
3380 sub xSP, 20h
3381
3382 fninit
3383 fld tword [A3]
3384 fld tword [A2]
3385 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3386 %1 st1
3387
3388 fnstsw word [A1]
3389 pushf
3390 pop xAX
3391
3392 fninit
3393 add xSP, 20h
3394 EPILOGUE_4_ARGS
3395ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3396%endmacro
3397
3398IEMIMPL_FPU_R80_BY_R80_EFL fcomi
3399IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3400
3401
3402;;
3403; FPU instruction working on one 80-bit floating point value.
3404;
3405; @param 1 The instruction
3406;
3407; @param A0 FPU context (fxsave).
3408; @param A1 Pointer to a IEMFPURESULT for the output.
3409; @param A2 Pointer to the 80-bit value.
3410;
3411%macro IEMIMPL_FPU_R80 1
3412BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3413 PROLOGUE_3_ARGS
3414 sub xSP, 20h
3415
3416 fninit
3417 fld tword [A2]
3418 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3419 %1
3420
3421 fnstsw word [A1 + IEMFPURESULT.FSW]
3422 fnclex
3423 fstp tword [A1 + IEMFPURESULT.r80Result]
3424
3425 fninit
3426 add xSP, 20h
3427 EPILOGUE_3_ARGS
3428ENDPROC iemAImpl_ %+ %1 %+ _r80
3429%endmacro
3430
3431IEMIMPL_FPU_R80 fchs
3432IEMIMPL_FPU_R80 fabs
3433IEMIMPL_FPU_R80 f2xm1
3434IEMIMPL_FPU_R80 fsqrt
3435IEMIMPL_FPU_R80 frndint
3436IEMIMPL_FPU_R80 fsin
3437IEMIMPL_FPU_R80 fcos
3438
3439
3440;;
3441; FPU instruction working on one 80-bit floating point value, only
3442; returning FSW.
3443;
3444; @param 1 The instruction
3445; @param 2 Non-zero to also restore FTW.
3446;
3447; @param A0 FPU context (fxsave).
3448; @param A1 Pointer to a uint16_t for the resulting FSW.
3449; @param A2 Pointer to the 80-bit value.
3450;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch stack space (presumably used by the FXSTATE load macros - not visible here)

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; st0 = input value
%if %2 != 0
        ; Also restore FTW (e.g. fxam needs the tag state to classify empty registers).
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1                              ; examine/test st0; only FSW is of interest

        fnstsw  word  [A1]              ; return resulting FSW to the caller

        fninit                          ; don't leak FPU state to the caller
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3475
3476
3477
3478;;
3479; FPU instruction loading a 80-bit floating point constant.
3480;
3481; @param 1 The instruction
3482;
3483; @param A0 FPU context (fxsave).
3484; @param A1 Pointer to a IEMFPURESULT for the output.
3485;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; scratch stack space (presumably used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW - not visible here)

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; push the constant onto the FPU stack (st0)

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions so fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak FPU state to the caller
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3512
3513
3514;;
3515; FPU instruction working on one 80-bit floating point value, outputing two.
3516;
3517; @param 1 The instruction
3518;
3519; @param A0 FPU context (fxsave).
3520; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3521; @param A2 Pointer to the 80-bit value.
3522;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch stack space (presumably used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW - not visible here)

        fninit                          ; start from a clean FPU state
        fld     tword [A2]              ; st0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; fptan/fxtract/fsincos: leaves two results on the stack

        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW]
        fnclex                          ; clear exceptions so the stores below cannot fault
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; st0 (topmost) is result 2
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; former st1 is result 1

        fninit                          ; don't leak FPU state to the caller
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3548
3549
3550
3551
3552;---------------------- SSE and MMX Operations ----------------------
3553
;; @todo what do we need to do for MMX?
; NOTE(review): these six macros are intentionally empty placeholders for
; per-technology save/restore work; several users below mistakenly invoke
; the *_PROLOGUE variant in the epilogue position, which is only harmless
; for as long as these stay empty.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3571
3572
3573;;
3574; Media instruction working on two full sized registers.
3575;
3576; @param 1 The instruction
3577; @param 2 Whether there is an MMX variant (1) or not (0).
3578;
3579; @param A0 FPU context (fxsave).
3580; @param A1 Pointer to the first media register size operand (input/output).
3581; @param A2 Pointer to the second media register size operand (input).
3582;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; first operand (also the destination)
        movq    mm1, [A2]               ; second operand
        %1      mm0, mm1
        movq    [A1], mm0               ; write result back to the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; first operand (also the destination)
        movdqu  xmm1, [A2]              ; second operand
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; write result back to the first operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb,  1
IEMIMPL_MEDIA_F2 pand,    1
IEMIMPL_MEDIA_F2 pandn,   1
IEMIMPL_MEDIA_F2 por,     1
IEMIMPL_MEDIA_F2 pxor,    1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb,   1
IEMIMPL_MEDIA_F2 paddw,   1
IEMIMPL_MEDIA_F2 paddd,   1
IEMIMPL_MEDIA_F2 paddq,   1
IEMIMPL_MEDIA_F2 paddsb,  1
IEMIMPL_MEDIA_F2 paddsw,  1
IEMIMPL_MEDIA_F2 paddusb, 1
IEMIMPL_MEDIA_F2 paddusw, 1
IEMIMPL_MEDIA_F2 psubb,   1
IEMIMPL_MEDIA_F2 psubw,   1
IEMIMPL_MEDIA_F2 psubd,   1
IEMIMPL_MEDIA_F2 psubq,   1
IEMIMPL_MEDIA_F2 psubsb,  1
IEMIMPL_MEDIA_F2 psubsw,  1
IEMIMPL_MEDIA_F2 psubusb, 1
IEMIMPL_MEDIA_F2 psubusw, 1
IEMIMPL_MEDIA_F2 pmullw,  1
IEMIMPL_MEDIA_F2 pmulld,  0
IEMIMPL_MEDIA_F2 pmulhw,  1
IEMIMPL_MEDIA_F2 pmaddwd, 1
IEMIMPL_MEDIA_F2 pminub,  1
IEMIMPL_MEDIA_F2 pminuw,  0
IEMIMPL_MEDIA_F2 pminud,  0
IEMIMPL_MEDIA_F2 pminsb,  0
IEMIMPL_MEDIA_F2 pminsw,  1
IEMIMPL_MEDIA_F2 pminsd,  0
IEMIMPL_MEDIA_F2 pmaxub,  1
IEMIMPL_MEDIA_F2 pmaxuw,  0
IEMIMPL_MEDIA_F2 pmaxud,  0
IEMIMPL_MEDIA_F2 pmaxsb,  0
IEMIMPL_MEDIA_F2 pmaxsw,  1
IEMIMPL_MEDIA_F2 pmaxsd,  0
IEMIMPL_MEDIA_F2 pabsb,   1
IEMIMPL_MEDIA_F2 pabsw,   1
IEMIMPL_MEDIA_F2 pabsd,   1
IEMIMPL_MEDIA_F2 psignb,  1
IEMIMPL_MEDIA_F2 psignw,  1
IEMIMPL_MEDIA_F2 psignd,  1
IEMIMPL_MEDIA_F2 phaddw,  1
IEMIMPL_MEDIA_F2 phaddd,  1
IEMIMPL_MEDIA_F2 phsubw,  1
IEMIMPL_MEDIA_F2 phsubd,  1
IEMIMPL_MEDIA_F2 phaddsw, 1
IEMIMPL_MEDIA_F2 phsubsw, 1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw,  1
IEMIMPL_MEDIA_F2 pmuludq,   1
3673
3674
3675;;
3676; Media instruction working on two full sized registers, but no FXSAVE state argument.
3677;
3678; @param 1 The instruction
3679; @param 2 Whether there is an MMX variant (1) or not (0).
3680;
3681; @param A0 Pointer to the first media register size operand (input/output).
3682; @param A1 Pointer to the second media register size operand (input).
3683;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; first operand (also the destination)
        movq    mm1, [A1]               ; second operand
        %1      mm0, mm1
        movq    [A0], mm0               ; write result back to the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; first operand (also the destination)
        movdqu  xmm1, [A1]              ; second operand
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write result back to the first operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw,    1
IEMIMPL_MEDIA_OPT_F2 pslld,    1
IEMIMPL_MEDIA_OPT_F2 psllq,    1
IEMIMPL_MEDIA_OPT_F2 psrlw,    1
IEMIMPL_MEDIA_OPT_F2 psrld,    1
IEMIMPL_MEDIA_OPT_F2 psrlq,    1
IEMIMPL_MEDIA_OPT_F2 psraw,    1
IEMIMPL_MEDIA_OPT_F2 psrad,    1
IEMIMPL_MEDIA_OPT_F2 pmulhuw,  1
IEMIMPL_MEDIA_OPT_F2 pavgb,    1
IEMIMPL_MEDIA_OPT_F2 pavgw,    1
IEMIMPL_MEDIA_OPT_F2 psadbw,   1
IEMIMPL_MEDIA_OPT_F2 pmuldq,   0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc,     0
IEMIMPL_MEDIA_OPT_F2 aesenc,     0
IEMIMPL_MEDIA_OPT_F2 aesdec,     0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte,  0
IEMIMPL_MEDIA_OPT_F2 sha1msg1,   0
IEMIMPL_MEDIA_OPT_F2 sha1msg2,   0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3746
3747;;
3748; Media instruction working on one full sized and one half sized register (lower half).
3749;
3750; @param 1 The instruction
3751; @param 2 1 if MMX is included, 0 if not.
3752;
3753; @param A0 Pointer to the first full sized media register operand (input/output).
3754; @param A1 Pointer to the second half sized media register operand (input).
3755;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; full sized operand (also the destination)
        movq    mm1, [A1]               ; half sized operand (loaded in full; the insn only uses the relevant half)
        %1      mm0, mm1
        movq    [A0], mm0               ; write result back to the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; full sized operand (also the destination)
        movdqu  xmm1, [A1]              ; half sized operand (loaded in full; the insn only uses the relevant half)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write result back to the first operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3790
3791
3792;;
3793; Media instruction working two half sized input registers (lower half) and a full sized
3794; destination register (vpunpckh*).
3795;
3796; @param 1 The instruction
3797;
3798; @param A0 Pointer to the destination register (full sized, output only).
3799; @param A1 Pointer to the first full sized media source register operand, where we
3800; will only use the lower half as input - but we'll be loading it in full.
3801; @param A2 Pointer to the second full sized media source register operand, where we
3802; will only use the lower half as input - but we'll be loading it in full.
3803;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source (only the low half is used by the insn)
        vmovdqu xmm1, [A2]              ; second source (only the low half is used by the insn)
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; full sized result

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; first source (only the low halves of each lane are used)
        vmovdqu ymm1, [A2]              ; second source (only the low halves of each lane are used)
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0              ; full sized result

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3836
3837
3838;;
3839; Media instruction working on one full sized and one half sized register (high half).
3840;
3841; @param 1 The instruction
3842; @param 2 1 if MMX is included, 0 if not.
3843;
3844; @param A0 Pointer to the first full sized media register operand (input/output).
3845; @param A1 Pointer to the second full sized media register operand, where we
3846; will only use the upper half as input - but we'll load it in full.
3847;
; NOTE(review): the high-half variants share the exact same register/memory
; shuffling as the low-half ones - only the instruction differs - so this
; simply forwards to IEMIMPL_MEDIA_F1L1.
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

IEMIMPL_MEDIA_F1L1 punpckhbw,  1
IEMIMPL_MEDIA_F1L1 punpckhwd,  1
IEMIMPL_MEDIA_F1L1 punpckhdq,  1
IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
3856
3857
3858;;
3859; Media instruction working two half sized input registers (high half) and a full sized
3860; destination register (vpunpckh*).
3861;
3862; @param 1 The instruction
3863;
3864; @param A0 Pointer to the destination register (full sized, output only).
3865; @param A1 Pointer to the first full sized media source register operand, where we
3866; will only use the upper half as input - but we'll be loading it in full.
3867; @param A2 Pointer to the second full sized media source register operand, where we
3868; will only use the upper half as input - but we'll be loading it in full.
3869;
; NOTE(review): same code shape as the low-half AVX variant; only the
; instruction differs, so forward to IEMIMPL_MEDIA_F1L1L1.
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3878
3879
3880;
3881; Shufflers with evil 8-bit immediates.
3882;
3883
;
; pshufw with an 8-bit immediate: the immediate is baked into a 256-entry
; table of "pshufw mm0, mm1, imm / ret" stubs and we jump to the right one.
;
; @param A0     Pointer to the destination (output).
; @param A1     Pointer to the source operand.
; @param A2     The 8-bit immediate (selects the stub).
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia! (fixed: was "movq mm0, mm0", a no-op; cf. the SSE variant below)
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]           ; T1 = &.imm0 + imm8 * 5
        call    T1                      ; execute the stub for this immediate
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64
3909
3910
;;
; SSE shuffle with an 8-bit immediate: jumps into a 256-entry table of
; "%1 xmm0, xmm1, imm / ret" stubs, one per immediate value.
;
; @param A0     Pointer to the destination (output).
; @param A1     Pointer to the source operand.
; @param A2     The 8-bit immediate (selects the stub).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * 6
        call    T1                      ; execute the stub for this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3942
3943
;;
; AVX 256-bit shuffle with an 8-bit immediate; same stub-table scheme as the
; SSE variant above.
;
; @param A0     Pointer to the destination (output).
; @param A1     Pointer to the source operand.
; @param A2     The 8-bit immediate (selects the stub).
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE            ; NOTE(review): SSE rather than AVX pro/epilogue used here - harmless while both are empty; confirm intent.

        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * 6
        call    T1                      ; execute the stub for this immediate
        vmovdqu [A0], ymm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3975
3976
3977;
3978; Shifts with evil 8-bit immediates.
3979;
3980
;;
; MMX shift with an 8-bit immediate; jumps into a 256-entry stub table,
; one "%1 mm0, imm / ret" stub per immediate value.
;
; @param A0     Pointer to the operand to shift (input/output).
; @param A1     The 8-bit immediate (selects the stub).
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]           ; T1 = &.imm0 + imm8 * 5
        call    T1                      ; execute the stub for this immediate
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4016
4017
;;
; SSE shift with an 8-bit immediate; jumps into a 256-entry stub table,
; one "%1 xmm0, imm / ret" stub per immediate value.
;
; @param A0     Pointer to the operand to shift (input/output).
; @param A1     The 8-bit immediate (selects the stub).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: (A1 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * 6
        call    T1                      ; execute the stub for this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4055
4056
4057;
4058; Move byte mask.
4059;
4060
;;
; pmovmskb on an MMX register: collects the sign bits of the 8 bytes.
;
; @param A0     Pointer to a 64-bit destination for the mask (output).
; @param A1     Pointer to the MMX-sized source operand.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        pmovmskb T0, mm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32-bit wide here; zero the upper half of the u64 result
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4074
;;
; pmovmskb on an XMM register: collects the sign bits of the 16 bytes.
;
; @param A0     Pointer to a 64-bit destination for the mask (output).
; @param A1     Pointer to the XMM-sized source operand.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32-bit wide here; zero the upper half of the u64 result
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4088
;;
; vpmovmskb on a YMM register: collects the sign bits of the 32 bytes.
;
; @param A0     Pointer to a 64-bit destination for the mask (output).
; @param A1     Pointer to the YMM-sized source operand.
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32-bit wide here; zero the upper half of the u64 result
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4102
4103
4104;;
4105; Media instruction working on two full sized source registers and one destination (AVX).
4106;
4107; @param 1 The instruction
4108;
4109; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4110; @param A1 Pointer to the destination media register size operand (output).
4111; @param A2 Pointer to the first source media register size operand (input).
4112; @param A3 Pointer to the second source media register size operand (input).
4113;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]              ; first source
        vmovdqu xmm1, [A3]              ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0              ; destination

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]              ; first source
        vmovdqu ymm1, [A3]              ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0              ; destination

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4175
4176
4177;;
4178; Media instruction working on two full sized source registers and one destination (AVX),
4179; but no XSAVE state pointer argument.
4180;
4181; @param 1 The instruction
4182;
4183; @param A0 Pointer to the destination media register size operand (output).
4184; @param A1 Pointer to the first source media register size operand (input).
4185; @param A2 Pointer to the second source media register size operand (input).
4186;
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source
        vmovdqu xmm1, [A2]              ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; destination

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; first source
        vmovdqu ymm1, [A2]              ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0              ; destination

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4243
4244;;
4245; Media instruction working on one full sized source registers and one destination (AVX),
4246; but no XSAVE state pointer argument.
4247;
4248; @param 1 The instruction
4249; @param 2 Flag whether the isntruction has a 256-bit (AVX2) variant (1) or not (0).
4250;
4251; @param A0 Pointer to the destination media register size operand (output).
4252; @param A1 Pointer to the source media register size operand (input).
4253;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; source
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0              ; destination

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; source
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0              ; destination

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb,       1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw,       1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd,       1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw,  0
4286
4287
4288;
4289; The SSE 4.2 crc32
4290;
; @param A0 Pointer to the 32-bit destination.
; @param A1 The source operand, sized according to the suffix.
4293;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC accumulator
        crc32   T0_32, A1_8             ; fold in one source byte
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4303
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC accumulator
        crc32   T0_32, A1_16            ; fold in a 16-bit source operand
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4313
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC accumulator
        crc32   T0_32, A1_32            ; fold in a 32-bit source operand
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4323
%ifdef RT_ARCH_AMD64                    ; the 64-bit crc32 form only exists in long mode
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC accumulator (zero-extends into T0)
        crc32   T0, A1                  ; fold in a 64-bit source operand
        mov     [A0], T0_32             ; the result is always 32-bit

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4335
4336
4337;
4338; PTEST (SSE 4.1)
4339;
4340; @param A0 Pointer to the first source operand (aka readonly destination).
4341; @param A1 Pointer to the second source operand.
4342; @param A2 Pointer to the EFLAGS register.
4343;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; first source (ptest does not modify it)
        movdqu  xmm1, [A1]              ; second source
        ptest   xmm0, xmm1              ; sets ZF/CF from AND / ANDN tests
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4356
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE            ; NOTE(review): SSE rather than AVX pro/epilogue for an AVX insn - harmless while both are empty; confirm intent.

        vmovdqu ymm0, [A0]              ; first source (vptest does not modify it)
        vmovdqu ymm1, [A1]              ; second source
        vptest  ymm0, ymm1              ; sets ZF/CF from AND / ANDN tests
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4369
4370
4371;;
4372; Template for the [v]pmov{s,z}x* instructions
4373;
4374; @param 1 The instruction
4375;
4376; @param A0 Pointer to the destination media register size operand (output).
; @param A1 The source operand value (input); for the _u256 variant this is a pointer to the 128-bit source.
4378;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1                ; A1 is passed by value (NASM encodes the 64-bit movq form when A1 is a 64-bit register)
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_SSE_EPILOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1                ; A1 is passed by value
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]              ; the 256-bit variant takes a pointer to the 128-bit source
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4430
4431
4432;;
4433; Need to move this as well somewhere better?
4434;
struc IEMSSERESULT
        .uResult        resd 4          ; 128-bit result value
        .MXCSR          resd 1          ; MXCSR value after the operation
endstruc
4439
4440
4441;;
4442; Need to move this as well somewhere better?
4443;
struc IEMAVX128RESULT
        .uResult        resd 4          ; 128-bit result value
        .MXCSR          resd 1          ; MXCSR value after the operation
endstruc
4448
4449
4450;;
4451; Need to move this as well somewhere better?
4452;
struc IEMAVX256RESULT
        .uResult        resd 8          ; 256-bit result value
        .MXCSR          resd 1          ; MXCSR value after the operation
endstruc
4457
4458
4459;;
4460; Initialize the SSE MXCSR register using the guest value partially to
4461; account for rounding mode.
4462;
4463; @uses 4 bytes of stack to save the original value, T0.
4464; @param 1 Expression giving the address of the FXSTATE of the guest.
4465;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR on the stack (restored by SSE_ST_FXSTATE_MXCSR)
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only FZ/RC/DAZ from the guest
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions so the host never traps; flags are read back later
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4478
4479
4480;;
4481; Restores the SSE MXCSR register with the original value.
4482;
4483; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4484; @param 1 Expression giving the address where to return the MXCSR value.
4485; @param 2 Expression giving the address of the FXSTATE of the guest.
4486;
4487; @note Restores the stack pointer.
4488;
%macro SSE_ST_FXSTATE_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; read MXCSR after the operation
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the exception flags raised by the op
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ldmxcsr [xSP]                   ; restore host MXCSR saved by SSE_LD_FXSTATE_MXCSR
        add     xSP, 4
%endmacro
4503
4504
4505;;
4506; Initialize the SSE MXCSR register using the guest value partially to
4507; account for rounding mode.
4508;
4509; @uses 4 bytes of stack to save the original value.
4510; @param 1 Expression giving the address of the FXSTATE of the guest.
4511;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR on the stack (restored by AVX*_ST_XSAVEAREA_MXCSR)
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only FZ/RC/DAZ from the guest
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; note: unlike SSE_LD_FXSTATE_MXCSR, exceptions are NOT masked here
        add     xSP, 4
%endmacro
4523
4524
4525;;
4526; Restores the AVX128 MXCSR register with the original value.
4527;
4528; @param 1 Expression giving the address where to return the MXCSR value.
4529;
4530; @note Restores the stack pointer.
4531;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR] ; return the post-op MXCSR verbatim

        ldmxcsr [xSP]                   ; restore host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
        add     xSP, 4
%endmacro
4538
4539
4540;;
4541; Restores the AVX256 MXCSR register with the original value.
4542;
4543; @param 1 Expression giving the address where to return the MXCSR value.
4544;
4545; @note Restores the stack pointer.
4546;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR] ; return the post-op MXCSR verbatim

        ldmxcsr [xSP]                   ; restore host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
        add     xSP, 4
%endmacro
4553
4554
4555;;
4556; Floating point instruction working on two full sized registers.
4557;
4558; @param 1 The instruction
4559; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4560;
4561; @param A0 FPU context (FXSTATE or XSAVEAREA).
4562; @param A1 Where to return the result including the MXCSR value.
4563; @param A2 Pointer to the first media register size operand (input/output).
4564; @param A3 Pointer to the second media register size operand (input).
4565;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12 ; NOTE(review): 4 args but arg size 12 (cf. 16 in IEMIMPL_FP_F2_R32) - confirm x86 fastcall decoration.
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]              ; first operand
        movdqu  xmm1, [A3]              ; second operand
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]              ; first operand
        vmovdqu xmm1, [A3]              ; second operand
        v %+ %1 xmm0, xmm0, xmm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]              ; first operand
        vmovdqu ymm1, [A3]              ; second operand
        v %+ %1 ymm0, ymm0, ymm1        ; three-operand AVX form
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1              ; two-operand AVX form (unary ops)
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1              ; two-operand AVX form (unary ops)
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (harmless only while the macro is empty)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.
IEMIMPL_FP_F2 sqrtps,    2
IEMIMPL_FP_F2 rsqrtps,   2
IEMIMPL_FP_F2 sqrtpd,    2
IEMIMPL_FP_F2 cvtdq2ps,  2
IEMIMPL_FP_F2 cvtps2dq,  2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd,  0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq,  0 ; @todo AVX variants due to register size differences missing right now
4679
4680
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; @param 1 The instruction
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]                  ; load just the 32-bit scalar operand
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1            ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE                ; fixed: was IEMIMPL_AVX_PROLOGUE (cf. IEMIMPL_FP_F2_R64)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro
4722
; Scalar single-precision operations: xmm op r/m32.
IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
4732
4733
;;
; Floating point instruction working on a full sized register and a double precision operand.
;
; @param 1 The instruction
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movq    xmm1, [A3]                  ; load just the 64-bit scalar operand
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovq   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1            ; three-operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

; Scalar double-precision operations: xmm op r/m64.
IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4784
4785
;;
; Macro for the cvtpd2ps/cvtps2pd instructions.
;
; 1 The instruction name.
; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1                  ; 256-bit source narrows to a 128-bit result
 %else
        v %+ %1 ymm0, xmm1                  ; 128-bit source widens to a 256-bit result
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
4850
4851
;;
; shufps instructions with 8-bit immediates.
;
; Since the imm8 must be baked into the instruction encoding, a table of 256
; shufps variants (one per immediate value) is generated below and the right
; one is dispatched to via an indexed call.
;
; @param A0 Pointer to the destination media register size operand (input/output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]         ; base of the 256-entry table
        lea     T0, [A2 + A2*2]             ; sizeof(shufpX+ret+int3) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1                          ; execute the entry for imm8 == A2
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufps  xmm0, xmm1, bImm
        ret
        int3                                ; pads each entry to exactly 6 bytes
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufps_u128
4885
4886
;;
; shufpd instruction with 8-bit immediates.
;
; Same imm8 jump-table technique as shufps above; shufpd carries a 66h prefix,
; so shufpd+ret is already 6 bytes and no int3 padding is needed.
;
; @param A0 Pointer to the destination media register size operand (input/output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]             ; sizeof(shufpX+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufpd  xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufpd_u128
4919
4920
;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Uses the same 256-entry imm8 jump-table technique as shufps/shufpd above.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
; @param A3 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]             ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]             ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
4987
4988
;;
; One of the [p]blendv{b,ps,pd} variants
;
; @param 1 The instruction
;
; @param A0 Pointer to the first media register sized operand (input/output).
; @param A1 Pointer to the second media sized value (input).
; @param A2 Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2] ; This is implicit (the non-VEX blendv forms read the mask from xmm0)
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1] ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE                ; fixed: was a second IEMIMPL_SSE_PROLOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5017
5018
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; @param 1 The instruction
;
; @param A0 Pointer to the first media register sized operand (output).
; @param A1 Pointer to the first media register sized operand (input).
; @param A2 Pointer to the second media register sized operand (input).
; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]                  ; mask is an explicit 4th operand in the VEX form
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE                ; fixed: was a second IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE                ; fixed: was a second IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5061
5062
;;
; palignr mm1, mm2/m64 instruction.
;
; Uses the 256-entry imm8 jump-table technique (see shufps above).
;
; @param A0 Pointer to the first media register sized operand (output).
; @param A1 The second register sized operand (input).
; @param A2 The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, A1                     ; A1 is the value itself, not a pointer
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]             ; sizeof(palignr+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        palignr mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_palignr_u64
5094
5095
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Each jump-table entry is insn(6) + ret(1) + int3(1) == 8 bytes.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*3]             ; sizeof(insnX+ret) == 8: (A2 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
        int3                                ; pads each entry to exactly 8 bytes
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5142
5143
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Each jump-table entry is insn(6) + ret(1) + int3(1) == 8 bytes.
;
; @param 1 The instruction name.
; @param 2 Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
; @param A3 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]             ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3                                ; pads each entry to exactly 8 bytes
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]             ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5220
5221
;;
; Source-operand bundles for the pcmp{e,i}str{i,m} helpers below.
; Need to move this as well somewhere better?
;
struc IEMPCMPISTRXSRC
    .uSrc1 resd 4                           ; first 128-bit string operand
    .uSrc2 resd 4                           ; second 128-bit string operand
endstruc

struc IEMPCMPESTRXSRC
    .uSrc1 resd 4                           ; first 128-bit string operand
    .uSrc2 resd 4                           ; second 128-bit string operand
    .u64Rax resd 2                          ; explicit length register for uSrc1
    .u64Rdx resd 2                          ; explicit length register for uSrc2
endstruc
5236
;;
; The pcmpistri instruction.
;
; Uses the 256-entry imm8 jump-table technique (entry size 8).
;
; @param A0 Pointer to the ECX register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2, A0                      ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]             ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx                   ; pcmpistri delivers its index result in ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistri xmm0, xmm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistri_u128
5274
;;
; The pcmpestri instruction.
;
; Like pcmpistri, but the explicit-length form also takes string lengths in
; rax/rdx, which must be loaded before the dispatch call and restored after.
;
; @param A0 Pointer to the ECX register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                      ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]             ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        push    xDX                         ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        call    T1

        pop     xDX                         ; restore A1/A2 before they are used below
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx                   ; pcmpestri delivers its index result in ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpestri xmm0, xmm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpestri_u128
5316
;;
; The pcmpistrm instruction template.
;
; The sources go in xmm1/xmm2 because pcmpistrm writes its mask result to xmm0.
;
; @param A0 Pointer to the XMM0 register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]             ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0                  ; store the mask produced in xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistrm_u128
5353
;;
; The pcmpestrm instruction template.
;
; Explicit-length form: string lengths come in via rax/rdx, which are saved
; and restored around the dispatch; the mask result lands in xmm0.
;
; @param A0 Pointer to the XMM0 register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        push    xAX
        push    xDX
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax]
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]             ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0
        pop     xDX
        pop     xAX

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpestrm xmm1, xmm2, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpestrm_u128
5396
5397
;;
; pinsrw instruction.
;
; Jump-table entry size is 5 (pinsrw(4) + ret(1)), hence the *5 index scaling.
;
; NOTE(review): this touches mm0 but uses IEMIMPL_SSE_PROLOGUE/EPILOGUE,
; whereas palignr_u64 above uses the IEMIMPL_MMX_* pair - confirm this is
; intentional (x87/MMX state handling).
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 The 16 bit input operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movq    mm0, [A0]
        lea     T0, [A2 + A2*4]             ; sizeof(pinsrw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  mm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u64
5429
; pinsrw xmm variant; entry size 6 (66h-prefixed pinsrw(5) + ret(1)).
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]             ; sizeof(pinsrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u128
5454
;;
; vpinsrw instruction.
;
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE/EPILOGUE for an AVX instruction -
; confirm whether the IEMIMPL_AVX_* pair is required here.
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 16 bit input operand (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]             ; sizeof(vpinsrw+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        mov     A1, A2                      ; A2 requires longer encoding on Windows
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpinsrw_u128
5488
5489
;;
; pextrw instruction.
;
; Each jump-table entry extracts into T0_32; entry size is 5 (pextrw(4)+ret(1)).
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 Pointer to the media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movq    mm0, A1                     ; A1 is the value itself, not a pointer
        lea     T0, [A2 + A2*4]             ; sizeof(pextrw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, mm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u64
5521
; pextrw xmm variant; entry size 6 (66h-prefixed pextrw(5) + ret(1)).
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]             ; sizeof(pextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u128
5546
;;
; vpextrw instruction.
;
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE/EPILOGUE for an AVX instruction -
; confirm whether the IEMIMPL_AVX_* pair is required here.
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]             ; sizeof(vpextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpextrw T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                    ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)              ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)              ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpextrw_u128
5578
5579
;;
; movmskp{s,d} SSE instruction template
;
; Emits the SSE helper plus the 128-bit and 256-bit AVX helpers.
;
; @param 1 The SSE instruction name.
; @param 2 The AVX instruction name.
;
; @param A0 Pointer to the output register (output/byte sized).
; @param A1 Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8             ; only the low byte of the mask is returned

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5629
5630
;;
; Restores the SSE MXCSR register with the original value.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param 2 Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        sub     xSP, 4
        stmxcsr [xSP]                       ; read the current (guest) MXCSR via a stack temp
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the exception status flags
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                       ; NOTE(review): assumes the paired SSE_LD_FXSTATE_MXCSR
                                            ; left the host MXCSR at [xSP] - confirm pairing.
        add     xSP, 4                      ; pop the slot pushed by the LD macro
%endmacro
5654
5655
;;
; cvttsd2si instruction - 32-bit variant.
; Truncating (round-toward-zero) double -> i32 conversion.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64

;;
; cvttsd2si instruction - 64-bit variant.
; Truncating (round-toward-zero) double -> i64 conversion.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64


;;
; cvtsd2si instruction - 32-bit variant.
; Double -> i32 conversion using the rounding mode from the loaded MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64

;;
; cvtsd2si instruction - 64-bit variant.
; Double -> i64 conversion using the rounding mode from the loaded MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
5740
5741
;;
; cvttss2si instruction - 32-bit variant.
;
; Truncating conversion of a single precision value to a signed 32-bit
; integer (rounds toward zero regardless of MXCSR.RC).
;
; @param A0     FPU context (FXSTATE or XSAVEAREA).
; @param A1     Where to return the MXCSR value.
; @param A2     Pointer to the result operand (output).
; @param A3     Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the FXSTATE at A0

        cvttss2si T0_32, [A3]           ; truncating float -> i32
        mov dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR (incl. exception flags) via A1
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32
5762
;;
; cvttss2si instruction - 64-bit variant.
;
; Truncating conversion of a single precision value to a signed 64-bit
; integer (rounds toward zero regardless of MXCSR.RC).
;
; @param A0     FPU context (FXSTATE or XSAVEAREA).
; @param A1     Where to return the MXCSR value.
; @param A2     Pointer to the result operand (output).
; @param A3     Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the FXSTATE at A0

        cvttss2si T0, [A3]              ; truncating float -> i64
        mov qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR (incl. exception flags) via A1
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
5783
5784
;;
; cvtss2si instruction - 32-bit variant.
;
; Converts a single precision value to a signed 32-bit integer using the
; rounding mode from the guest MXCSR.
;
; @param A0     FPU context (FXSTATE or XSAVEAREA).
; @param A1     Where to return the MXCSR value.
; @param A2     Pointer to the result operand (output).
; @param A3     Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the FXSTATE at A0

        cvtss2si T0_32, [A3]            ; float -> i32, rounded per MXCSR.RC
        mov dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR (incl. exception flags) via A1
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32
5805
;;
; cvtss2si instruction - 64-bit variant.
;
; Converts a single precision value to a signed 64-bit integer using the
; rounding mode from the guest MXCSR.
;
; @param A0     FPU context (FXSTATE or XSAVEAREA).
; @param A1     Where to return the MXCSR value.
; @param A2     Pointer to the result operand (output).
; @param A3     Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the FXSTATE at A0

        cvtss2si T0, [A3]               ; float -> i64, rounded per MXCSR.RC
        mov qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR (incl. exception flags) via A1
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
5826
5827
;;
; cvtsi2ss instruction - 32-bit variant.
;
; Converts a signed 32-bit integer to a single precision value using the
; rounding mode from the guest MXCSR.
;
; @param A0     FPU context (FXSTATE or XSAVEAREA).
; @param A1     Where to return the MXCSR value.
; @param A2     Pointer to the result operand (output).
; @param A3     Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the FXSTATE at A0

        cvtsi2ss xmm0, dword [A3]       ; only the low dword of xmm0 is defined by the result...
        movd dword [A2], xmm0           ; ...and only that dword is stored.

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR (incl. exception flags) via A1
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
5848
;;
; cvtsi2ss instruction - 64-bit variant.
;
; Converts a signed 64-bit integer to a single precision value using the
; rounding mode from the guest MXCSR.
;
; @param A0     FPU context (FXSTATE or XSAVEAREA).
; @param A1     Where to return the MXCSR value.
; @param A2     Pointer to the result operand (output).
; @param A3     Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the FXSTATE at A0

        cvtsi2ss xmm0, qword [A3]       ; i64 source, float result in the low dword of xmm0
        movd dword [A2], xmm0           ; store just the 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR (incl. exception flags) via A1
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
5869
5870
;;
; cvtsi2sd instruction - 32-bit variant.
;
; Converts a signed 32-bit integer to a double precision value (every i32 is
; exactly representable, but the guest MXCSR is still made active).
;
; @param A0     FPU context (FXSTATE or XSAVEAREA).
; @param A1     Where to return the MXCSR value.
; @param A2     Pointer to the result operand (output).
; @param A3     Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the FXSTATE at A0

        cvtsi2sd xmm0, dword [A3]       ; i32 -> double in the low qword of xmm0
        movq [A2], xmm0                 ; store the 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR (incl. exception flags) via A1
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
5891
;;
; cvtsi2sd instruction - 64-bit variant.
;
; Converts a signed 64-bit integer to a double precision value using the
; rounding mode from the guest MXCSR.
;
; @param A0     FPU context (FXSTATE or XSAVEAREA).
; @param A1     Where to return the MXCSR value.
; @param A2     Pointer to the result operand (output).
; @param A3     Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; activate the guest MXCSR from the FXSTATE at A0

        cvtsi2sd xmm0, qword [A3]       ; i64 -> double in the low qword of xmm0
        movq [A2], xmm0                 ; store the 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the resulting MXCSR (incl. exception flags) via A1
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
5912
5913
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Saves the host MXCSR in a 4-byte stack slot which deliberately remains
; allocated when the macro completes; the matching
; SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE macro restores and pops it.  Only the
; guest FZ, RC and DAZ bits are taken over, and all exceptions are masked so
; the host never takes a SIMD fault on the guest's behalf.
;
; @uses 4 bytes of stack to save the original value, T0.
; @param 1 Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        sub xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR (slot stays allocated for the ST macro)
        mov T0_32, [%1]                 ; fetch the guest MXCSR
        and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; keep flush-to-zero, rounding ctrl & denormals-are-zero only
        or T0_32, X86_MXCSR_XCPT_MASK   ; mask all SIMD exceptions
        sub xSP, 4
        mov [xSP], T0_32
        ldmxcsr [xSP]                   ; make the merged value the active MXCSR
        add xSP, 4
%endmacro
5933
5934
;;
; Restores the SSE MXCSR register with the original value.
;
; Reads the exception flags the operation raised in the active MXCSR, ORs
; them into the guest MXCSR value at [%1], then restores the host MXCSR
; from the stack slot left behind by SSE_LD_FXSTATE_MXCSR_ONLY and pops it.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        sub xSP, 4
        stmxcsr [xSP]                   ; grab the active MXCSR (with fresh exception flags)
        mov T0_32, [xSP]
        add xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov T1_32, [%1]
        and T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the exception status bits
        or T0_32, T1_32
        mov [%1], T0_32

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by SSE_LD_FXSTATE_MXCSR_ONLY
        add xSP, 4
%endmacro
5957
5958
;
; UCOMISS (SSE)
;
; Unordered compare of the low single precision elements; the outcome lands
; in ZF/PF/CF which are copied into the guest EFLAGS.
;
; @param A0     Pointer to the MXCSR value (input/output).
; @param A1     Pointer to the EFLAGS value (input/output).
; @param A2     Pointer to the first source operand (aka readonly destination).
; @param A3     Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2]               ; unaligned loads - the operand pointers need not be 16-byte aligned
        movdqu xmm1, [A3]
        ucomiss xmm0, xmm1              ; sets ZF/PF/CF, clears OF/AF/SF
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags to the guest EFLAGS at [A1]

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
5981
;;
; VUCOMISS - VEX encoded variant of ucomiss; same inputs/outputs as
; iemAImpl_ucomiss_u128 above, but requires AVX support on the host.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2]
        movdqu xmm1, [A3]
        vucomiss xmm0, xmm1             ; sets ZF/PF/CF, clears OF/AF/SF
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags to the guest EFLAGS at [A1]

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
5996
5997
;
; UCOMISD (SSE)
;
; Unordered compare of the low double precision elements; the outcome lands
; in ZF/PF/CF which are copied into the guest EFLAGS.
;
; @param A0     Pointer to the MXCSR value (input/output).
; @param A1     Pointer to the EFLAGS value (input/output).
; @param A2     Pointer to the first source operand (aka readonly destination).
; @param A3     Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2]               ; unaligned loads - the operand pointers need not be 16-byte aligned
        movdqu xmm1, [A3]
        ucomisd xmm0, xmm1              ; sets ZF/PF/CF, clears OF/AF/SF
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags to the guest EFLAGS at [A1]

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
6020
;;
; VUCOMISD - VEX encoded variant of ucomisd; same inputs/outputs as
; iemAImpl_ucomisd_u128 above, but requires AVX support on the host.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2]
        movdqu xmm1, [A3]
        vucomisd xmm0, xmm1             ; sets ZF/PF/CF, clears OF/AF/SF
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags to the guest EFLAGS at [A1]

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6035
;
; COMISS (SSE)
;
; Ordered compare of the low single precision elements (signals invalid on
; QNaN, unlike ucomiss); the outcome lands in ZF/PF/CF which are copied into
; the guest EFLAGS.
;
; @param A0     Pointer to the MXCSR value (input/output).
; @param A1     Pointer to the EFLAGS value (input/output).
; @param A2     Pointer to the first source operand (aka readonly destination).
; @param A3     Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2]               ; unaligned loads - the operand pointers need not be 16-byte aligned
        movdqu xmm1, [A3]
        comiss xmm0, xmm1               ; sets ZF/PF/CF, clears OF/AF/SF
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags to the guest EFLAGS at [A1]

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
6058
;;
; VCOMISS - VEX encoded variant of comiss; same inputs/outputs as
; iemAImpl_comiss_u128 above, but requires AVX support on the host.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2]
        movdqu xmm1, [A3]
        vcomiss xmm0, xmm1              ; sets ZF/PF/CF, clears OF/AF/SF
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags to the guest EFLAGS at [A1]

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6073
6074
;
; COMISD (SSE)
;
; Ordered compare of the low double precision elements (signals invalid on
; QNaN, unlike ucomisd); the outcome lands in ZF/PF/CF which are copied into
; the guest EFLAGS.
;
; @param A0     Pointer to the MXCSR value (input/output).
; @param A1     Pointer to the EFLAGS value (input/output).
; @param A2     Pointer to the first source operand (aka readonly destination).
; @param A3     Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2]               ; unaligned loads - the operand pointers need not be 16-byte aligned
        movdqu xmm1, [A3]
        comisd xmm0, xmm1               ; sets ZF/PF/CF, clears OF/AF/SF
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags to the guest EFLAGS at [A1]

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
6097
;;
; VCOMISD - VEX encoded variant of comisd; same inputs/outputs as
; iemAImpl_comisd_u128 above, but requires AVX support on the host.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2]
        movdqu xmm1, [A3]
        vcomisd xmm0, xmm1              ; sets ZF/PF/CF, clears OF/AF/SF
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags to the guest EFLAGS at [A1]

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6112
6113
;;
; Two 128-bit source operands packed back to back, as passed to the
; two-input media helpers below (cmpps & friends).
;
; Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1 resd 4                       ; first 128-bit source operand
    .uSrc2 resd 4                       ; second 128-bit source operand
endstruc
6121
6122
;
; CMPPS (SSE)
;
; The imm8 predicate cannot be supplied at runtime, so a table of 256
; 'cmpps xmm0, xmm1, <imm> / ret' stubs is generated at assembly time and
; the entry matching A3 is computed and called.
;
; @param A0     Pointer to the MXCSR value (input/output).
; @param A1     Pointer to the first media register size operand (output).
; @param A2     Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3     The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea T0, [A3 + A3*4]             ; sizeof(cmpps+ret) == 5: T0 = imm8 * 5
        lea T1, [.imm0 xWrtRIP]         ; RIP-relative base of the stub table
        lea T1, [T1 + T0]
        call T1                         ; executes 'cmpps xmm0, xmm1, imm8'
        movdqu [A1], xmm0               ; store the comparison mask result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        cmpps xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_cmpps_u128
6158
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; Uses the same imm8 stub-table dispatch as iemAImpl_cmpps_u128, with a
; 6-byte stride per entry (5-byte instruction + 1-byte ret).
;
; @param 1      The instruction name.
;
; @param A0     Pointer to the MXCSR value (input/output).
; @param A1     Pointer to the first media register size operand (output).
; @param A2     Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3     The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea T1, [.imm0 xWrtRIP]         ; RIP-relative base of the stub table
        lea T0, [A3 + A3*2]             ; sizeof(insn+ret) == 6: imm8 * 6 = (A3 * 3) * 2
        lea T1, [T1 + T0*2]
        call T1                         ; executes '%1 xmm0, xmm1, imm8'
        movdqu [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6205
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; Uses the same imm8 stub-table dispatch as iemAImpl_cmpps_u128, with a
; 7-byte stride per entry (6-byte instruction + 1-byte ret).
;
; @param 1      The instruction name.
;
; @param A0     Pointer to the MXCSR value (input/output).
; @param A1     Pointer to the first media register size operand (output).
; @param A2     Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3     The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea T1, [.imm0 xWrtRIP]         ; RIP-relative base of the stub table
        lea T0, [A3*2 + A3]             ; sizeof(insn+ret) == 7: imm8 * 7 = 2 * (A3 * 3) + A3
        lea T0, [T0*2]
        lea T0, [T0 + A3]
        lea T1, [T1 + T0]
        call T1                         ; executes '%1 xmm0, xmm1, imm8'
        movdqu [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*(6+1) == 0x700
dw 0xf8ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x106ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6255
6256
;;
; SSE instructions of the form
;    xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; @param 1      The instruction name.
;
; @param A0     Pointer to the MXCSR value (input/output).
; @param A1     Pointer to the first MMX register sized operand (output).
; @param A2     Pointer to the media register sized operand (input).
;
; @note NOTE(review): writes mm0 without an emms here - presumably the
;       surrounding IEM code handles the x87/MMX state transition; verify.
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A2]               ; load the 128-bit source
        %1 mm0, xmm0                    ; convert into the MMX register
        movq [A1], mm0                  ; store the 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6286
;;
; SSE instructions of the form
;    xxx xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; @param 1      The instruction name.
;
; @param A0     Pointer to the MXCSR value (input/output).
; @param A1     Pointer to the first media register sized operand (input/output).
; @param A2     The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movdqu xmm0, [A1]               ; destination is input/output - preserve the untouched lanes
        movq mm0, A2                    ; source value arrives by register
        %1 xmm0, mm0
        movdqu [A1], xmm0               ; write the full 128-bit result back

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6317
;;
; SSE instructions of the form
;    xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; @param 1      The instruction name.
;
; @param A0     Pointer to the MXCSR value (input/output).
; @param A1     Pointer to the first MMX media register sized operand (output).
; @param A2     The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; switch to the guest MXCSR (host value parked on the stack)

        movq xmm0, A2                   ; source value arrives by register
        %1 mm0, xmm0                    ; convert into the MMX register
        movq [A1], mm0                  ; store the 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags into [A0] & restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6347
;
; All forms of RDRAND and RDSEED
;
; @param 1      The instruction mnemonic (rdrand or rdseed).
; @param 2      The scratch register matching the operand size (ax/eax/rax).
; @param 3      The operand width in bits (16, 32 or 64) - used in the symbol name.
;
; @param A0     Pointer to the destination operand.
; @param A1     Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1 %2                           ; rdrand/rdseed into the scratch register; CF signals success
        mov [A0], %2                    ; store the value (undefined when CF=0) for the guest
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; pass CF and the cleared status flags back via [A1]

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6372
6373
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The imm8 round selector cannot be supplied at runtime, so a 256-entry
; table of 'sha1rnds4 xmm0, xmm1, <imm> / ret' stubs is generated at
; assembly time and the entry matching A2 is computed and called
; (6-byte stride: 5-byte instruction + 1-byte ret).
;
; @param A0     Pointer to the first media register size operand (input/output).
; @param A1     Pointer to the second source media register size operand (input).
; @param A2     The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm0, [A0]               ; destination is input/output
        movdqu xmm1, [A1]
        lea T1, [.imm0 xWrtRIP]         ; RIP-relative base of the stub table
        lea T0, [A2 + A2*2]             ; sizeof(insn+ret) == 6: imm8 * 6 = (A2 * 3) * 2
        lea T1, [T1 + T0*2]
        call T1                         ; executes 'sha1rnds4 xmm0, xmm1, imm8'
        movdqu [A0], xmm0               ; write the result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        sha1rnds4 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_sha1rnds4_u128
6408
6409
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction takes XMM0 as an implicit third operand, so the caller's
; constants are loaded into the host xmm0 before executing it.
;
; @param A0     Pointer to the first media register size operand (input/output).
; @param A1     Pointer to the second source media register size operand (input).
; @param A2     Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm0, [A2]               ; implicit XMM0 operand (wk constants)
        movdqu xmm1, [A0]               ; destination is input/output
        movdqu xmm2, [A1]
        sha256rnds2 xmm1, xmm2          ; uses xmm0 implicitly
        movdqu [A0], xmm1               ; write the result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette