; $Id: IEMAllAImpl.asm 98921 2023-03-12 16:54:45Z vboxsync $
;; @file
; IEM - Instruction Implementation in Assembly.
;

;
; Copyright (C) 2011-2023 Oracle and/or its affiliates.
;
; This file is part of VirtualBox base platform packages, as
; available from https://www.virtualbox.org.
;
; This program is free software; you can redistribute it and/or
; modify it under the terms of the GNU General Public License
; as published by the Free Software Foundation, in version 3 of the
; License.
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, see <https://www.gnu.org/licenses>.
;
; SPDX-License-Identifier: GPL-3.0-only
;


;*********************************************************************************************************************************
;*      Header Files                                                                                                             *
;*********************************************************************************************************************************
%include "VBox/asmdefs.mac"
%include "VBox/err.mac"
%include "iprt/x86.mac"


;*********************************************************************************************************************************
;*      Defined Constants And Macros                                                                                             *
;*********************************************************************************************************************************

;;
; RET XX / RET wrapper for fastcall.
;
; Emits 'ret %1' (callee pops %1 bytes of stack arguments) only for 32-bit
; x86 Windows builds; every other configuration gets a plain 'ret'.
;
; @param        1       Number of stack argument bytes to pop on x86/Windows.
;
%macro RET_FASTCALL 1
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
    ret %1
 %else
    ret
 %endif
%else
    ret
%endif
%endmacro

;;
; NAME for fastcall functions.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
; Default: plain NAME() decoration.  Only 32-bit x86 Windows builds get the
; '<prefix><name>@<cbArgs>' form.
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix)   NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif

;;
; BEGINPROC for fastcall functions.
;
; Exports/globalizes the symbol as dictated by the object format and then
; emits the entry label.
;
; @param        1       The function name (C).
; @param        2       The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
  export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
   export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
  global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro


;
; We employ some macro assembly here to hide the calling convention differences.
;
; On AMD64 all arguments arrive in registers, so the prologues are empty and
; the epilogues are a bare 'ret'.  On x86 the first two arguments are in
; ECX/EDX and the rest are fetched from the stack into EBX/ESI by the
; prologue; EDI is saved because it serves as T1.
;
%ifdef RT_ARCH_AMD64
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 ; NOTE(review): declared with 0 parameters here but with 1 in the x86
 ; section below - confirm whether any caller passes an argument.
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 ; Argument registers, GCC (System V) 64-bit calling convention.
 %ifdef ASM_CALL64_GCC
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif

 ; Argument registers, Microsoft 64-bit calling convention.
 %ifdef ASM_CALL64_MSC
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif

 ; Temporary (scratch) registers.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; A2: skip the saved ebx and the return address.
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; A2: skip three saved registers and the return address.
        mov     esi, [esp + 12 + 4 + 4] ; A3: the next stack slot.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 ; Argument registers (A2/A3 are loaded from the stack by the prologues).
 %define A0         ecx
 %define A0_32      ecx
 %define A0_16       cx
 %define A0_8        cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16       dx
 %define A1_8        dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16       bx
 %define A2_8        bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16       si

 ; Temporary (scratch) registers.
 %define T0         eax
 %define T0_32      eax
 %define T0_16       ax
 %define T0_8        al

 %define T1         edi
 %define T1_32      edi
 %define T1_16       di
%endif


;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; The host EFLAGS are pushed, the bits in (%2 | %3) are replaced with the
; corresponding guest bits, and the merged value is popped back into EFLAGS.
; The '%if (%3) != 0' guard is commented out, so the load is unconditional.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        A2      The register pointing to the flags.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                                   ; store current flags
        mov     T0_32, [%1]                     ; load the guest flags
        and     dword [xSP], ~(%2 | %3)         ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)                ; select the modified and undefined flags.
        or      [xSP], T0                       ; merge guest flags with host flags.
        popf                                    ; load the mixed flags.
 ;%endif
%endmacro

;;
; Load the relevant flags from [%1].
;
; Same instruction sequence as IEM_MAYBE_LOAD_FLAGS: the bits in (%2 | %3)
; of the host EFLAGS are replaced with the guest's.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        A2      The register pointing to the flags.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of flags to load.
; @param        3       The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
        pushf                                   ; store current flags
        mov     T0_32, [%1]                     ; load the guest flags
        and     dword [xSP], ~(%2 | %3)         ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)                ; select the modified and undefined flags.
        or      [xSP], T0                       ; merge guest flags with host flags.
        popf                                    ; load the mixed flags.
%endmacro

;;
; Update the flag.
;
; Copies the bits in (%2 | %3) from the current CPU EFLAGS into the guest
; EFLAGS at [%1]; all other guest bits are kept.  Expands to nothing when
; both masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1
        mov     T0_32, [%1]                     ; flags
        and     T0_32, ~(%2 | %3)               ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)                ; select the modified and undefined flags.
        or      T0_32, T1_32                    ; combine the flags.
        mov     [%1], T0_32                     ; save the flags.
 %endif
%endmacro

;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; Expands to nothing when all three masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1
        mov     T0_32, [%1]                     ; load flags.
        and     T0_32, ~(%2 | %3)               ; clear the modified and always cleared flags.
        and     T1_32, (%2)                     ; select the modified flags.
        or      T0_32, T1_32                    ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4                       ; add the always set flags.
  %endif
        mov     [%1], T0_32                     ; save the result.
 %endif
%endmacro

;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0.
; So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks      Clobbers T1, stack, %6, EFLAGS.  On AMD64 T2 is clobbered as
;               well; on x86 T0 is borrowed but restored via push/pop.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index. Will be modified!
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2
 %else
        push    T0                              ; preserve T0/xAX around the flag extraction.
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]                     ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)                     ; select the modified flags.
        or      T1_32, T2_32                    ; combine the flags.
 %else
        and     T0_32, (%2)                     ; select the modified flags.
        or      T1_32, T0_32                    ; combine the flags.
        pop     T0
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1                      ; CF = most significant bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff                        ; the low result byte indexes g_afParity.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32                     ; save the result.
%endmacro

;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; The CPU EFLAGS are not consulted.  Expands to nothing when both masks are
; zero.
;
; @remarks      Clobbers T0.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)                    ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3                       ; Add the always set flags.
  %endif
        mov     [%1], T0_32                     ; Save the result.
 %endif
%endmacro

;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; @remarks      Clobbers T0, %4, EFLAGS.
;               On AMD64 T2 is used as scratch for the table address too.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]                     ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF)       ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3                       ; Add the always set flags.
 %endif
        and     %4, 0xff                        ; The low byte indexes g_afParity.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                     ; Save the result.
%endmacro


;*********************************************************************************************************************************
;*      External Symbols                                                                                                         *
;*********************************************************************************************************************************
extern NAME(g_afParity)


;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
; Every generated function follows the same pattern: load the guest flags
; into the CPU, execute the instruction on [A0], then save the resulting
; flags back to [A2].
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr,lock, modified-flags, undefined flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF


;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; The result is computed into T0 and then stored to [A0]; the flags are
; loaded before and saved after the instruction via A3.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                 instr,  modified-flags,                                          undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF),              (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)

;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; These are three-argument functions (A0 = destination pointer, A1 = source,
; A2 = eflags pointer; 12 bytes of arguments on x86), so the three-argument
; prologue/epilogue pair is used.  On 32-bit x86 the four-argument pair would
; read a non-existent fourth stack argument and return with 'ret 8' instead
; of 'ret 4'; on AMD64 both pairs expand to the same code.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                   instr,  modified-flags,                                          undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)


;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand in A2.  EFLAGS are not involved, so there is no eflags argument.
;
; @param        1       The instruction mnemonic.
; @param        2       Fallback instruction if applicable.
; @param        3       Whether to emit fallback or not.
;
; Fallback notes: the fallback instruction (%2) takes its count in CL.
;   - GCC/AMD64: A2 is rdx, so the count is copied into cl (A3/rcx is not an
;     argument here) and the result is stored via A0.
;   - MSC/AMD64 and x86: A0 is xCX, so A0 and A2 are exchanged; afterwards
;     cl holds the count and A2 holds the destination pointer.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                  ; cl = count, A2 = destination pointer.
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; full 64-bit result.
   %else
        xchg    A2, A0                  ; cl = count, A2 = destination pointer.
        %2      A1, cl
        mov     [A2], A1                ; full 64-bit result; A0 is no longer a pointer here.
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                       instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop,            0


;
; RORX uses a immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
; A0 = destination pointer, A1 = value to rotate, A2 = rotate count.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; cl = count, A2 = destination pointer.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1                ; full 64-bit result.
  %else
        xchg    A2, A0                  ; cl = count, A2 = destination pointer.
        ror     A1, cl
        mov     [A2], A1                ; full 64-bit result; A0 is no longer a pointer here.
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64


;
; MULX
;
; A0 = pointer to the first destination (receives the high half),
; A1 = pointer to the second destination (receives the low half),
; A2 = first source (must end up in xDX), A3 = second source.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32


BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A1_32                   ; After the exchange uSrc1 lives in A1 (xDX); A2 is the low destination pointer.
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A1                      ; After the exchange uSrc1 lives in A1 (xDX); A2 is the low destination pointer.
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif


;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr,lock, modified-flags, undefined flags
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)

;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leaves it as is.  The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Non-zero if destination isn't written when ZF=1.
;                       Zero if always written.
;
; Three variants are generated per operand size:
;   - native:  loads the guest flags and saves the host result flags.
;   - _intel:  result in T1; flags are computed from fixed masks plus the
;              parity table instead of being read back from the CPU.
;   - _amd:    saves only the %2 flags from the CPU.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        ; Only reachable via the jz above, i.e. when %4 is non-zero.
        IEM_ADJUST_FLAGS             A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS             A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        ; NOTE(review): the 16 and 32-bit _intel variants do not load the
        ; guest flags first - confirm whether this load is intended.
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS             A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

;              instr,  modified-flags,               undefined-flags,                                                          dst-unchanged-on-ZF
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF),               (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF),               (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF),  (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF),  (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0


;;
; Macro for implementing POPCNT.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; The result is computed into T0 and then stored to [A0].
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
        mov     [A0], T0_16
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0


;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
BEGINCODE
; @param        1       EFLAGS that are modified.
; @param        2       Undefined EFLAGS.
; @param        3       Function suffix.
; @param        4       EFLAGS variation: 0 for native, 1 for intel (ignored),
;                       2 for AMD (set AF, clear PF, ZF and SF).
;                       NOTE(review): the macro body only tests '%4 != 1';
;                       variation 1 takes the IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF
;                       path while 0 and 2 both use IEM_SAVE_FLAGS, which does
;                       not match the description above - confirm the intent.
; Two-operand IMUL: A0 points to the destination operand, A1 holds the source
; value and A2 points to the eflags.  The product is formed in A1 and then
; written back to [A0].
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                    A2, %1, %2
        imul    A1_16, word [A0]
        mov     [A0], A1_16
 %if %4 != 1
        IEM_SAVE_FLAGS                          A2, %1, %2
 %else
        ; Variation 1: AF and ZF are cleared, SF and PF are derived from the result in A1.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                    A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS                          A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                    A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS                          A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
; The first instantiation passes an empty function suffix.
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF,       , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                 _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                 _amd,   2


;
; XCHG for memory operands.  This implies locking.  No flag changes.
;
; Each function takes two arguments, first the pointer to the memory,
; then the pointer to the register.  They all return void.
;
; The locked variants use the XCHG instruction with a memory operand; the
; register value is read from [A1], exchanged with [A0] and the old memory
; value is written back to [A1].
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]
        xchg    [A0], T0_8
        mov     [A1], T0_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif

; Unlocked variants for fDisregardLock mode.
; These swap the two locations with plain loads and stores via T0 and T1.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]
        mov     T1_8, [A0]
        mov     [A0], T0_8
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        mov     T1, [A0]
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif


;
; XADD for memory operands.
;
; Each function takes three arguments, first the pointer to the
; memory/register, then the pointer to the register, and finally a pointer to
; eflags.  They all return void.
;
BEGINCODE
; Common shape: load the register operand from [A1], execute xadd against the
; destination at [A0], store the value xadd left in the register back to [A1],
; and save the six arithmetic flags through A2.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = register operand value
        xadd    [A0], T0_8              ; [A0] += T0; T0 = old [A0]
        mov     [A1], T0_8              ; old destination value -> register operand
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64

; Locked variants: identical to the above except for the lock prefix on xadd.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64


;
; CMPXCHG8B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
;                                             uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; rbx is needed for the instruction's implicit operand; restored below.

        ; rcx and rdx are implicit cmpxchg8b operands, so move the incoming
        ; pointers out of them first.
        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; ecx:ebx = *pu64EbxEcx (replacement value)
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; edx:eax = *pu64EaxEdx (comparand)
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write edx:eax back to *pu64EaxEdx
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; rbx is needed for the instruction's implicit operand; restored below.

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; ecx:ebx = *pu64EbxEcx (replacement value)
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; edx:eax = *pu64EaxEdx (comparand)
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; write edx:eax back to *pu64EaxEdx
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        ; Stack arguments sit above the four saved registers (16 bytes) and
        ; the return address (4 bytes).
        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = *pu64EbxEcx (replacement value)
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0  ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = *pu64EaxEdx (comparand)
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; write edx:eax back to *pu64EaxEdx
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b

BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked

%ifdef RT_ARCH_AMD64

;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
;                                              uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; rbx is needed for the instruction's implicit operand; restored below.

        ; rcx and rdx are implicit cmpxchg16b operands, so move the incoming
        ; pointers out of them first.
        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; rcx:rbx = replacement value
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = comparand
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write rdx:rax back to the comparand buffer
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; rbx is needed for the instruction's implicit operand; restored below.

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = replacement value
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = comparand
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; write rdx:rax back to the comparand buffer
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b

BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked

%endif ; RT_ARCH_AMD64


;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
; Macro arguments: 1 = instruction prefix (empty or lock), 2 = function name
; suffix (empty or _locked).
;
; Each generated function loads the accumulator from [A1], executes cmpxchg
; against the destination at [A0] with the register operand in A2, writes the
; accumulator back to [A1] and saves the six arithmetic flags through A3.

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        ; Stack arguments sit above the four saved registers (16 bytes) and
        ; the return address (4 bytes).
        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = replacement value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0  ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = comparand
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF is clear when the comparison failed; only then do the old
        ; accumulator at [esi] and the value just read into edx:eax differ.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; equal: just set the other flags (and keep ZF=1).
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ; Compare the old accumulator with the destination value, high dword
        ; first; the low dwords only matter when the high dwords match.
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked

;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
; Every function below applies the instruction directly to the memory operand
; at [A0], between a flag load and a flag save that both go through A1.
; The _locked variants differ only in the lock prefix.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0


;
; BSWAP. No flag changes.
;
; Each function takes one argument, pointer to the value to bswap
; (input/output).  They all return void.
;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h                          ; operand-size prefix for the bswap that follows
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16

BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32

BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
        PROLOGUE_1_ARGS
%ifdef RT_ARCH_AMD64
        ; One 64-bit register does the whole job.
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
%else
        ; 32-bit host: byte-swap each half, then store the halves crosswise.
        mov     T0, [A0]                ; low dword
        bswap   T0
        mov     T1, [A0 + 4]            ; high dword
        bswap   T1
        mov     [A0 + 4], T0
        mov     [A0], T1
%endif
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u64


;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
; The shift/rotate instruction takes its count in cl.  With ASM_CALL64_GCC the
; count is copied from A1 into cl.  Otherwise A0 and A1 are exchanged so the
; destination pointer ends up in A1; the instruction still reads the count
; from cl, which relies on the register ASSUMPTIONS stated for this macro.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      byte [A0], cl
 %else
        xchg    A1, A0
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
  %else
        xchg    A1, A0
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)


;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The functions takes the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
; The instruction takes its count in cl.  With ASM_CALL64_GCC, A3 and A2 are
; exchanged around the instruction and exchanged back afterwards so A3 again
; holds the eflags pointer for IEM_SAVE_FLAGS.  Otherwise A0 and A2 are
; exchanged once, leaving the destination pointer in A2; A0 is not used again.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_16, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
  %else
        xchg    A0, A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)


;;
; Macro for implementing a multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 so the caller can be used for div/idiv as well as
; for the mul/imul implementation.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Name suffix.
; @param        5       EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
; 8-bit: AL comes from [A0], the multiplier is A1_8, the 16-bit product in AX
; goes back to [A0], and the flags are saved through A2.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A2, %2, %3
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %5 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %else
        IEM_SAVE_FLAGS                       A2, %2, %3
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

; 16/32/64-bit: the instruction writes its high half to rDX, so outside
; ASM_CALL64_GCC the rDX pointer in A1 is parked in T1 before multiplying.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A3, %2, %3
        mov     ax, [A0]
 %ifndef ASM_CALL64_GCC
        mov     T1, A1
 %endif
        %1      A2_16
        mov     [A0], ax
 %ifdef ASM_CALL64_GCC
        mov     [A1], dx
 %else
        mov     [T1], dx
 %endif
 %if %5 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %else
        IEM_SAVE_FLAGS                       A3, %2, %3
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A3, %2, %3
        mov     eax, [A0]
 %ifndef ASM_CALL64_GCC
        mov     T1, A1
 %endif
        %1      A2_32
        mov     [A0], eax
 %ifdef ASM_CALL64_GCC
        mov     [A1], edx
 %else
        mov     [T1], edx
 %endif
 %if %5 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %else
        IEM_SAVE_FLAGS                       A3, %2, %3
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A3, %2, %3
        mov     rax, [A0]
  %ifndef ASM_CALL64_GCC
        mov     T1, A1
  %endif
        %1      A2
        mov     [A0], rax
  %ifdef ASM_CALL64_GCC
        mov     [A1], rdx
  %else
        mov     [T1], rdx
  %endif
  %if %5 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
  %else
        IEM_SAVE_FLAGS                       A3, %2, %3
  %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF),       , 0
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0,                                                   _intel, 1
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0,                                                   _amd,   2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF),       , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0,                                                   _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0,                                                   _amd,   2


BEGINCODE
;;
; Worker function for negating the double-width number held in the 32-bit
; register pair T1:T0 (T1 = high half, T0 = low half).
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0                       ; two zeroed stack slots
        push    0
        xchg    T0_32, [xSP]            ; T0 = 0, slot 0 = old low half
        xchg    T1_32, [xSP + xCB]      ; T1 = 0, slot 1 = old high half
        sub     T0_32, [xSP]            ; 0 - low
        sbb     T1_32, [xSP + xCB]      ; 0 - high - borrow
        add     xSP, xCB*2
        ret
ENDPROC   iemAImpl_negate_T0_T1_u32

%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the double-width number held in the 64-bit
; register pair T1:T0 (T1 = high half, T0 = low half).
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0                       ; two zeroed stack slots
        push    0
        xchg    T0, [xSP]               ; T0 = 0, slot 0 = old low half
        xchg    T1, [xSP + xCB]         ; T1 = 0, slot 1 = old high half
        sub     T0, [xSP]               ; 0 - low
        sbb     T1, [xSP + xCB]         ; 0 - high - borrow
        add     xSP, xCB*2
        ret
ENDPROC   iemAImpl_negate_T0_T1_u64
%endif


;;
; Macro for implementing a division operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 on success and -1 if a divide error should be
; raised by the caller.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       1 if signed, 0 if unsigned.
; @param        5       Function suffix.
; @param        6       EFLAGS variation: 0 for native, 1 for intel (ignored),
;                       2 for AMD (set AF, clear PF, ZF and SF).
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
; Every function first rejects a zero divisor and a quotient that would not
; fit (returning -1 without executing the instruction), so the real div/idiv
; below is only reached when it cannot fault.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; high byte of the dividend (AH) vs divisor
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0                      ; pop leaves the flags from the cmp intact
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS    A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS      A2, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 (caller raises the divide error)
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; high half of the dividend (DX) vs divisor
        jae     .div_overflow
 %else
        mov     T0_16, [A1]             ; assemble DX:AX in T0_32
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0                      ; pop leaves the flags from the cmp intact
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor -> T1 before dx is loaded with the dividend's high half
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; rDX pointer -> T1 before dx is loaded with the dividend's high half
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS    A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS      A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 (caller raises the divide error)
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; high half of the dividend (EDX) vs divisor
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0                      ; pop leaves the flags from the cmp intact
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor -> T1 before edx is loaded with the dividend's high half
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; rDX pointer -> T1 before edx is loaded with the dividend's high half
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS    A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS      A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; signed path: drop/restore the A2 saved above
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 (caller raises the divide error)
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2
        jz      .div_zero
  %if %4 == 0
        cmp     [A1], A2                ; high half of the dividend (RDX) vs divisor
        jae     .div_overflow
  %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0                      ; pop leaves the flags from the cmp intact
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
  %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor -> T1 before rdx is loaded with the dividend's high half
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                  ; rDX pointer -> T1 before rdx is loaded with the dividend's high half
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS    A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
  %else
        IEM_SAVE_FLAGS      A3, %2, %3
  %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
  %if %4 != 0
        pop     A2                      ; signed path: drop/restore the A2 saved above
  %endif
.div_zero:
        mov     eax, -1                 ; return -1 (caller raises the divide error)
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,       , 0
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _amd,   2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1,       , 0
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _amd,   2


;;
; Macro for implementing memory fence operation.
;
; No return value, no operands or anything.
;
; @param        1      The instruction.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence

;;
; Alternative for non-SSE2 host.
;
; Pushes xAX, exchanges it with the stack slot just written, and drops the
; slot again, so no register or stack content changes.
; NOTE(review): presumably relies on xchg with a memory operand being
; implicitly locked to act as the fence - confirm.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX
        xchg    xAX, [xSP]
        add     xSP, xCB
        ret
ENDPROC iemAImpl_alt_mem_fence


;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; The current FPU environment is stored at [xSP], patched and reloaded, so the
; caller must have reserved stack space at xSP beforehand.
;
; @uses     24 bytes of stack. T0, T1
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        ; Condition bits come from the guest, TOP is kept from the stored environment.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro


;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word, status word, and update the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses     24 bytes of stack. T0, T1
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        ; Condition bits come from the guest, TOP is kept from the stored environment.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        ; T1 = guest TOP, used as the bit index into the guest tag word.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear.  Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]
%endmacro


;;
; Need to move this as well somewhere better?
;
; Layout: five words of r80 result followed by one FSW word.
;
struc IEMFPURESULT
    .r80Result  resw 5
    .FSW        resw 1
endstruc


;;
; Need to move this as well somewhere better?
;
; Layout: five words of first r80 result, one FSW word, five words of second
; r80 result.
;
struc IEMFPURESULTTWO
    .r80Result1 resw 5
    .FSW        resw 1
    .r80Result2 resw 5
endstruc


;
;---------------------- 16-bit signed integer operations ----------------------
;


;;
; Converts a 16-bit signed integer value to a 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment used by the macro below

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initialised state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16


;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment used by the macro below

        fninit
        fld     tword [A3]              ; loaded before the guest control/status words are applied
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initialised state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16


;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment used by the macro below

        fninit
        fld     tword [A3]              ; loaded before the guest control/status words are applied
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initialised state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16


;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment used by the macro below

        fninit
        fld     tword [A2]              ; loaded before the guest control/status words are applied
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initialised state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr


;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment used by the macro below

        fninit
        fld     tword [A2]              ; loaded before the guest control/status words are applied
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initialised state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom



;
;---------------------- 32-bit signed integer operations ----------------------
;


;;
; Converts a 32-bit signed integer value to a 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment used by the macro below

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initialised state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32


;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]              ; Operand is loaded while the FPU is still in its fninit default state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32


;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]              ; fisttp always truncates (chop), regardless of FCW.RC.

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32


;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; ST0 = ST0 <op> 32-bit integer at [A3].

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr


;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; Operate on ST0 and the 32-bit integer at [A3].

        fnstsw  word [A1]               ; Only the status word is returned.

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom



;
;---------------------- 64-bit signed integer operations ----------------------
;


;;
; Converts a 64-bit signed integer value to a 80-bit floating point one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64


;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64


;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]              ; Operand is loaded while the FPU is still in its fninit default state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]              ; fisttp always truncates (chop), regardless of FCW.RC.

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64



;
;---------------------- 32-bit floating point operations ----------------------
;

;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32


;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]              ; Non-popping store; the stack is reset by the fninit below anyway.

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32


;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand, loaded in the fninit default state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; ST0 = ST0 <op> 32-bit float at [A3].

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr


;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]               ; Only the status word is returned.

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom



;
;---------------------- 64-bit floating point operations ----------------------
;

;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64


;;
; Store a 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]              ; Operand is loaded while the FPU is still in its fninit default state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]              ; Non-popping store; the stack is reset by the fninit below anyway.

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64


;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; ST0 = ST0 <op> 64-bit float at [A3].

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr

;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1]               ; Only the status word is returned.

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom



;
;---------------------- 80-bit floating point operations ----------------------
;

;;
; Loads a 80-bit floating point register value from memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                          ; Start from a clean host FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80


;;
; Store a 80-bit floating point register to memory
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]              ; x87 only has a popping form for 80-bit memory stores.

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80


;;
; Loads an 80-bit floating point register value in BCD format from memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]              ; ST0 = packed BCD at [A2], converted to r80.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80


;;
; Store a 80-bit floating point register to memory as BCD
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit BCD value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]              ; Convert ST0 to packed BCD, store and pop.

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80


;;
; FPU instruction working on two 80-bit floating point values.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]              ; Pushed first, so it ends up in ST1.
        fld     tword [A2]              ; Pushed last, so it ends up in ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2                      ; %2 is the operand list; empty for implicit-operand instructions.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}


;;
; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
; storing the result in ST1 and popping the stack.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]              ; Ends up in ST1.
        fld     tword [A3]              ; Ends up in ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; Result goes to ST1 and the stack is popped, so it is in ST0 afterwards.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1


;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]              ; Ends up in ST1.
        fld     tword [A2]              ; Ends up in ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]               ; Only the status word (condition codes) is returned.

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom


;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param    1       The instruction
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1                     ; Compares ST0 with ST1 and sets the host EFLAGS.

        fnstsw  word [A1]
        pushf                           ; Must directly follow the compare (fnstsw leaves EFLAGS alone):
        pop     xAX                     ;   return value = host flags register.

        fninit
        add     xSP, 20h                ; NOTE(review): 'add' clobbers host EFLAGS, which is fine since the flags were already copied to xAX.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi


;;
; FPU instruction working on one 80-bit floating point value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos


;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW.
;
; @param    1       The instruction
; @param    2       Non-zero to also restore FTW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]              ; Operand is loaded while the FPU is still in its fninit default state.
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0 ; Variant selected by macro parameter 2 (see header).
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word [A1]               ; Only the status word is returned.

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.



;;
; FPU instruction loading a 80-bit floating point constant.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; Push the constant onto the FPU stack (ST0).

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1                 ; No trailing '%+': there is nothing left to paste onto the name.
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz


;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULTTWO for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; Leaves two values on the FPU stack (ST0 and ST1).

        fnstsw  word [A1 + IEMFPURESULTTWO.FSW]
        fnclex                          ; Clear exception flags before each waiting fstp.
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; ST0 (top of stack) is stored as the second result.
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; The former ST1 is stored as the first result.

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos




;---------------------- SSE and MMX Operations ----------------------

; The prologue/epilogue macros below are currently empty placeholders that
; bracket the MMX, SSE and AVX helper bodies.

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro


;;
; Media instruction working on two full sized registers.
;
; @param    1       The instruction
; @param    2       Whether there is an MMX variant (1) or not (0).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first media register size operand (input/output).
; @param    A2      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
; 64-bit MMX variant; only emitted when macro parameter 2 is non-zero.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0               ; A1 is both the first source and the destination.

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit SSE variant; always emitted.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; Unaligned loads/stores: no alignment requirement on the operands.
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb,  1
IEMIMPL_MEDIA_F2 pand,    1
IEMIMPL_MEDIA_F2 pandn,   1
IEMIMPL_MEDIA_F2 por,     1
IEMIMPL_MEDIA_F2 pxor,    1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb,   1
IEMIMPL_MEDIA_F2 paddw,   1
IEMIMPL_MEDIA_F2 paddd,   1
IEMIMPL_MEDIA_F2 paddq,   1
IEMIMPL_MEDIA_F2 paddsb,  1
IEMIMPL_MEDIA_F2 paddsw,  1
IEMIMPL_MEDIA_F2 paddusb, 1
IEMIMPL_MEDIA_F2 paddusw, 1
IEMIMPL_MEDIA_F2 psubb,   1
IEMIMPL_MEDIA_F2 psubw,   1
IEMIMPL_MEDIA_F2 psubd,   1
IEMIMPL_MEDIA_F2 psubq,   1
IEMIMPL_MEDIA_F2 psubsb,  1
IEMIMPL_MEDIA_F2 psubsw,  1
IEMIMPL_MEDIA_F2 psubusb, 1
IEMIMPL_MEDIA_F2 psubusw, 1
IEMIMPL_MEDIA_F2 pmullw,  1
IEMIMPL_MEDIA_F2 pmulld,  0
IEMIMPL_MEDIA_F2 pmulhw,  1
IEMIMPL_MEDIA_F2 pmaddwd, 1
IEMIMPL_MEDIA_F2 pminub,  1
IEMIMPL_MEDIA_F2 pminuw,  0
IEMIMPL_MEDIA_F2 pminud,  0
IEMIMPL_MEDIA_F2 pminsb,  0
IEMIMPL_MEDIA_F2 pminsw,  1
IEMIMPL_MEDIA_F2 pminsd,  0
IEMIMPL_MEDIA_F2 pmaxub,  1
IEMIMPL_MEDIA_F2 pmaxuw,  0
IEMIMPL_MEDIA_F2 pmaxud,  0
IEMIMPL_MEDIA_F2 pmaxsb,  0
IEMIMPL_MEDIA_F2 pmaxsw,  1
IEMIMPL_MEDIA_F2 pmaxsd,  0
IEMIMPL_MEDIA_F2 pabsb,   1
IEMIMPL_MEDIA_F2 pabsw,   1
IEMIMPL_MEDIA_F2 pabsd,   1
IEMIMPL_MEDIA_F2 psignb,  1
IEMIMPL_MEDIA_F2 psignw,  1
IEMIMPL_MEDIA_F2 psignd,  1
IEMIMPL_MEDIA_F2 phaddw,  1
IEMIMPL_MEDIA_F2 phaddd,  1
IEMIMPL_MEDIA_F2 phsubw,  1
IEMIMPL_MEDIA_F2 phsubd,  1
IEMIMPL_MEDIA_F2 phaddsw, 1
IEMIMPL_MEDIA_F2 phsubsw, 1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw, 1
IEMIMPL_MEDIA_F2 pmuludq, 1


;;
; Media instruction working on two full sized registers, but no FXSAVE state argument.
;
; @param    1       The instruction
; @param    2       Whether there is an MMX variant (1) or not (0).
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
; 64-bit MMX variant; only emitted when macro parameter 2 is non-zero.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, [A1]
        %1      mm0, mm1
        movq    [A0], mm0               ; A0 is both the first source and the destination.

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit SSE variant; always emitted.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0

;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, [A1]               ; Note: a full 64 bits are read from [A1] here.
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]              ; Note: a full 128 bits are read from [A1] here.
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0


;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; A0 is output only.

        IEMIMPL_AVX_EPILOGUE            ; Was IEMIMPL_AVX_PROLOGUE; both are empty today, so no code change.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Was IEMIMPL_AVX_PROLOGUE; see the 128-bit variant.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq


;;
; Media instruction working on one full sized and one half sized register (high half).
;
; Implemented as a thin wrapper around IEMIMPL_MEDIA_F1L1, as the generated
; code is the same: both operands are loaded in full.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second full sized media register operand, where we
;                   will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the F1H1 wrapper (expands to the same code as F1L1).
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0


;;
; Media instruction working two half sized input registers (high half) and a full sized
; destination register (vpunpckh*).
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the upper half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the upper half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq


;
; Shufflers with evil 8-bit immediates.
;
; The immediate has to be encoded in the instruction, so each helper carries a
; table of 256 '<instruction> ..., imm8 / ret' stubs and calls the stub at
; .imm0 + imm8 * sizeof(stub).  The 8-bit immediate argument is zero-extended
; first, as the upper bits of a narrow integer argument register are not
; guaranteed to be clear and would otherwise index outside the table.
;
; NOTE(review): several helpers below pass 16 as cbArgs to BEGINPROC_FASTCALL
; while taking only two or three arguments; this only affects the decorated
; name on 32-bit Windows - confirm against the C prototypes.
;

;;
; PSHUFW with the immediate passed as an argument.
;
; @param    A0      Pointer to the destination (output).
; @param    A1      Pointer to the 64-bit source operand (input).
; @param    A2      The 8-bit immediate (shuffle control).
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; Only the low 8 bits are defined; clear the rest before indexing.
        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia!
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64


;;
; SSE shuffle instruction with the immediate passed as an argument.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination (output).
; @param    A1      Pointer to the 128-bit source operand (input).
; @param    A2      The 8-bit immediate (shuffle control).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; Only the low 8 bits are defined; clear the rest before indexing.
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd


;;
; AVX2 shuffle instruction (256-bit) with the immediate passed as an argument.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination (output).
; @param    A1      Pointer to the 256-bit source operand (input).
; @param    A2      The 8-bit immediate (shuffle control).
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; Was the SSE prologue; both are empty today, so no code change.

        movzx   A2, A2_8                ; Only the low 8 bits are defined; clear the rest before indexing.
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(vpshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Was the SSE epilogue.
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd


;
; Shifts with evil 8-bit immediates.
;

;;
; MMX shift-by-immediate with the immediate passed as an argument.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the 64-bit operand (input/output).
; @param    A1      The 8-bit immediate (shift count).
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; Only the low 8 bits are defined; clear the rest before indexing.
        movq    mm0, [A0]
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad


;;
; SSE shift-by-immediate with the immediate passed as an argument.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the 128-bit operand (input/output).
; @param    A1      The 8-bit immediate (shift count).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; Only the low 8 bits are defined; clear the rest before indexing.
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: (A1 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq


;
; Move byte mask.
;
; A0 points to the 64-bit destination, A1 to the source media register value.
; On 32-bit hosts T0 is only 32 bits wide, so the upper half of the 64-bit
; destination is zeroed explicitly.
;

BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        pmovmskb T0, mm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64

BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128

BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256


;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; @param    1       The instruction
;
; @param    A0      Pointer to the extended CPU/FPU state (X86XSAVEAREA).
; @param    A1      Pointer to the destination media register size operand (output).
; @param    A2      Pointer to the first source media register size operand (input).
; @param    A3      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_F3 1
; 128-bit variant.  A0 (state pointer) is not referenced by the code.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Was IEMIMPL_AVX_PROLOGUE; both are empty today, so no code change.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Was IEMIMPL_AVX_PROLOGUE; see the 128-bit variant.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq


;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
;
; Each expansion emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256, computing
; [A0] = <insn>([A1], [A2]) with unaligned loads and stores.
;
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu  xmm0, [A1]
        vmovdqu  xmm1, [A2]
        %1       xmm0, xmm0, xmm1
        vmovdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu  ymm0, [A1]
        vmovdqu  ymm1, [A2]
        %1       ymm0, ymm0, ymm1
        vmovdqu  [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd

;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1       The instruction
; @param    2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
; NOTE(review): the procedures are declared with 12 bytes of arguments although
;               only A0 and A1 are used (8 would be expected) - confirm against
;               the C prototypes before changing, it alters the x86/Windows
;               symbol decoration.
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu  xmm0, [A1]
        %1       xmm0, xmm0
        vmovdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu  ymm0, [A1]
        %1       ymm0, ymm0
        vmovdqu  [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd,      1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0


;
; The SSE 4.2 crc32
;
; The current 32-bit value is read from [A0], updated with the source operand
; and written back to [A0].
;
; @param    A0      Pointer to the 32-bit destination (input/output).
; @param    A1      The source operand, sized according to the suffix.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_8
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8

BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_16
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16

BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_32
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32

%ifdef RT_ARCH_AMD64
; The 64-bit source form needs a 64-bit destination register; only the low
; 32 bits are loaded from and stored back to [A0].
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0, A1
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif


;
; PTEST (SSE 4.1)
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ptest   xmm0, xmm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128

; 256-bit VEX form of the above; uses the AVX prologue/epilogue pair like the
; other VEX encoded helpers in this file.
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256


;;
; Template for the [v]pmov{s,z}x* instructions
;
; Emits three procedures per instruction: the legacy SSE form, the VEX 128-bit
; form and the VEX 256-bit form.  The two 128-bit forms take the source by
; value in A1, the 256-bit form loads 128 bits of source from [A1].
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      The source operand value (input); a pointer to it for the _u256 variant.
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd     xmm0, A1
        %1       xmm0, xmm0
        movdqu   [A0], xmm0             ; legacy encoded store, this is the non-VEX variant

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd     xmm0, A1
        v %+ %1  xmm0, xmm0
        vmovdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu  xmm0, [A1]
        v %+ %1  ymm0, xmm0
        vmovdqu  [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq


;;
; Need to move this as well somewhere better?
;
struc IEMSSERESULT
    .uResult      resd 4
    .MXCSR        resd 1
endstruc


;;
; Need to move this as well somewhere better?
;
struc IEMAVX128RESULT
    .uResult      resd 4
    .MXCSR        resd 1
endstruc


;;
; Need to move this as well somewhere better?
;
; 256-bit result (eight dwords) followed by the MXCSR value.
struc IEMAVX256RESULT
    .uResult      resd 8
    .MXCSR        resd 1
endstruc


;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; The current MXCSR is left in a 4 byte stack slot; the matching
; SSE_ST_FXSTATE_MXCSR / SSE_ST_FXSTATE_MXCSR_ONLY reloads it from there and
; releases the slot.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub     xSP, 4                  ; slot for the current MXCSR, stays allocated

        stmxcsr [xSP]
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK
        sub     xSP, 4                  ; temporary slot, ldmxcsr only takes a memory operand
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro


;;
; Restores the SSE MXCSR register with the original value.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ldmxcsr [xSP]                   ; value stored by SSE_LD_FXSTATE_MXCSR
        add     xSP, 4                  ; release the slot allocated by SSE_LD_FXSTATE_MXCSR
%endmacro


;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Same sequence as SSE_LD_FXSTATE_MXCSR except that X86_MXCSR_XCPT_MASK is
; not or'ed into the loaded value.
; NOTE(review): confirm that leaving the exception mask bits clear is intended.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4                  ; slot for the current MXCSR, stays allocated

        stmxcsr [xSP]
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        sub     xSP, 4                  ; temporary slot, ldmxcsr only takes a memory operand
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro


;;
; Restores the AVX128 MXCSR register with the original value.
;
; The MXCSR value in effect after the operation is stored unmodified into the
; result structure before the saved value is reloaded.
;
; @param    1       Expression giving the address where to return the MXCSR value.
;
; @note Restores the stack pointer.
;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR]

        ldmxcsr [xSP]                   ; value stored by AVX_LD_XSAVEAREA_MXCSR
        add     xSP, 4
%endmacro


;;
; Restores the AVX256 MXCSR register with the original value.
;
; @param    1       Expression giving the address where to return the MXCSR value.
;
; @note Restores the stack pointer.
;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR]

        ldmxcsr [xSP]
        add     xSP, 4
%endmacro


;;
; Floating point instruction working on two full sized registers.
;
; Always emits the SSE iemAImpl_<insn>_u128 procedure.  When parameter 2 is
; 2 or 3, iemAImpl_v<insn>_u128 and iemAImpl_v<insn>_u256 are emitted as well,
; using the two- or three-operand VEX form respectively.
;
; NOTE(review): the procedures are declared with 12 bytes of arguments although
;               four arguments are used (16 would be expected) - confirm against
;               the C prototypes before changing.
;
; @param    1       The instruction
; @param    2       Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu   xmm0, [A2]
        movdqu   xmm1, [A3]
        %1       xmm0, xmm1
        movdqu   [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; pairs with the IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm0, [A2]
        vmovdqu  xmm1, [A3]
        v %+ %1  xmm0, xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  ymm0, [A2]
        vmovdqu  ymm1, [A3]
        v %+ %1  ymm0, ymm0, ymm1
        vmovdqu  [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm0, [A2]
        vmovdqu  xmm1, [A3]
        v %+ %1  xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  ymm0, [A2]
        vmovdqu  ymm1, [A3]
        v %+ %1  ymm0, ymm1
        vmovdqu  [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_FP_F2 addps,    3
IEMIMPL_FP_F2 addpd,    3
IEMIMPL_FP_F2 mulps,    3
IEMIMPL_FP_F2 mulpd,    3
IEMIMPL_FP_F2 subps,    3
IEMIMPL_FP_F2 subpd,    3
IEMIMPL_FP_F2 minps,    3
IEMIMPL_FP_F2 minpd,    3
IEMIMPL_FP_F2 divps,    3
IEMIMPL_FP_F2 divpd,    3
IEMIMPL_FP_F2 maxps,    3
IEMIMPL_FP_F2 maxpd,    3
IEMIMPL_FP_F2 haddps,   3
IEMIMPL_FP_F2 haddpd,   3
IEMIMPL_FP_F2 hsubps,   3
IEMIMPL_FP_F2 hsubpd,   3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.
IEMIMPL_FP_F2 sqrtps,    2
IEMIMPL_FP_F2 rsqrtps,   2
IEMIMPL_FP_F2 sqrtpd,    2
IEMIMPL_FP_F2 cvtdq2ps,  2
IEMIMPL_FP_F2 cvtps2dq,  2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd,  0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq,  0 ; @todo AVX variants due to register size differences missing right now


;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; @param    1       The instruction
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second single precision floating point value (input).
;
; Emits iemAImpl_<insn>_u128_r32 (SSE) and iemAImpl_v<insn>_u128_r32 (AVX).
; The second operand is loaded as a single dword from [A3].
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu   xmm0, [A2]
        movd     xmm1, [A3]
        %1       xmm0, xmm1
        movdqu   [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm0, [A2]
        vmovd    xmm1, [A3]
        v %+ %1  xmm0, xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss


;;
; Floating point instruction working on a full sized register and a double precision operand.
;
; @param    1       The instruction
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second double precision floating point value (input).
;
; Produces the SSE helper iemAImpl_<insn>_u128_r64 and its VEX counterpart
; iemAImpl_v<insn>_u128_r64.  The scalar operand is fetched as one qword
; from [A3], the full 128-bit result goes to the uResult member at A1.
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        ; Fetch both operands, the scalar one first.
        movq     xmm1, [A3]
        movdqu   xmm0, [A2]

        %1       xmm0, xmm1

        movdqu   [A1 + IEMSSERESULT.uResult], xmm0
        SSE_ST_FXSTATE_MXCSR A1, A0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        ; Fetch both operands, the scalar one first.
        vmovq    xmm1, [A3]
        vmovdqu  xmm0, [A2]

        v %+ %1  xmm0, xmm0, xmm1

        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0
        AVX128_ST_XSAVEAREA_MXCSR A1

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd


;;
; Macro for the cvtpd2ps/cvtps2pd instructions.
;
;           1       The instruction name.
;           2       Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second media register size operand (input).
;
; Three helpers come out of each expansion: the SSE one and the 128-bit and
; 256-bit VEX ones.  For the 256-bit helper parameter 2 picks which side of
; the conversion is the wide (ymm) one.
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu   xmm1, [A3]
        movdqu   xmm0, [A2]

        %1       xmm0, xmm1

        movdqu   [A1 + IEMSSERESULT.uResult], xmm0
        SSE_ST_FXSTATE_MXCSR A1, A0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm1, [A3]
        vmovdqu  xmm0, [A2]

        v %+ %1  xmm0, xmm1

        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0
        AVX128_ST_XSAVEAREA_MXCSR A1

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  ymm1, [A3]
        vmovdqu  ymm0, [A2]

 %if %2 == 0
        v %+ %1  xmm0, ymm1             ; narrowing: 256-bit source, 128-bit result
 %else
        v %+ %1  ymm0, xmm1             ; widening: 128-bit source, 256-bit result
 %endif

        vmovdqu  [A1 + IEMAVX256RESULT.uResult], ymm0
        AVX256_ST_XSAVEAREA_MXCSR A1

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1


;;
; shufps instructions with 8-bit immediates.
;
; The immediate cannot be supplied at run time, so a table with one 6 byte
; stub per immediate value follows the procedure and the right stub is called.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]

        ; T1 = .imm0 + A2 * 6
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6: (A2 * 3) *2
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]
        call    T1

        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; One stub per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufps  xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufps_u128


;;
; shufpd instruction with 8-bit immediates.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufpd  xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufpd_u128


;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Both generated procedures dispatch into a table with one 6 byte stub per
; immediate value.  VEX encoded moves are used throughout so that no legacy
; SSE instruction is mixed in with the VEX encoded shuffle.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd


;;
; One of the [p]blendv{b,ps,pd} variants
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      Pointer to the second media sized value (input).
; @param    A2      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu   xmm0, [A2] ; This is implicit
        movdqu   xmm1, [A0]
        movdqu   xmm2, [A1] ; @todo Do I need to save the original value here first?
        %1       xmm1, xmm2
        movdqu   [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; pairs with the IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd


;;
; One of the v[p]blendv{b,ps,pd} variants
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (output).
; @param    A1      Pointer to the first media register sized operand (input).
; @param    A2      Pointer to the second media register sized operand (input).
; @param    A3      Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
; 128-bit variant: [A0] = %1([A1], [A2], mask [A3]).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu   xmm0, [A1]
        vmovdqu   xmm1, [A2]
        vmovdqu   xmm2, [A3]
        %1        xmm0, xmm0, xmm1, xmm2
        vmovdqu   [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant of the above.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu   ymm0, [A1]
        vmovdqu   ymm1, [A2]
        vmovdqu   ymm2, [A3]
        %1        ymm0, ymm0, ymm1, ymm2
        vmovdqu   [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with the IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd


;;
; palignr mm1, mm2/m64 instruction.
;
; Dispatches into a table with one 6 byte stub per immediate value.
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      The second register sized operand (input).
; @param    A2      The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, A1
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        palignr mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_palignr_u64


;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
; The generated procedure is followed by 256 stubs of 8 bytes each
; (instruction, ret, int3 padding); the stub matching A2 is called.
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]

        ; T1 = .imm0 + A2 * 8
        lea     T0, [A2 + A2*3]         ; sizeof(insnX+ret) == 8: (A2 * 4) * 2
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]
        call    T1

        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; One stub per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw


;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
; Each generated procedure is followed by 256 stubs of 8 bytes each
; (instruction, ret, int3 padding); the stub matching A3 is called.  VEX
; encoded moves are used throughout so that no legacy SSE instruction is
; mixed in with the VEX encoded operation.
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0


;;
; Need to move this as well somewhere better?
;
; Source operand block for the pcmpistr{i,m} helpers.
struc IEMPCMPISTRXSRC
    .uSrc1        resd 4
    .uSrc2        resd 4
endstruc

; Source operand block for the pcmpestr{i,m} helpers; additionally carries
; the values loaded into xAX and xDX before the instruction is executed.
struc IEMPCMPESTRXSRC
    .uSrc1        resd 4
    .uSrc2        resd 4
    .u64Rax       resd 2
    .u64Rdx       resd 2
endstruc

;;
; The pcmpistri instruction.
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
; The procedure is followed by 256 stubs of 8 bytes each (instruction, ret,
; int3 padding); the stub matching A3 is called.
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2,   A0                ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; the index result, stored through the saved A0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistri xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistri_u128

;;
; The pcmpestri instruction.
;
; Like pcmpistri above, but xAX and xDX are loaded from the source structure
; before the stub is called.  Each stub is 8 bytes: the 0x48 byte, the
; instruction and the ret.
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2,   A0                ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        push    xDX                                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax]      ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]      ; must come last, A2 may be xDX itself
        call    T1

        pop     xDX                     ; get the argument register back before A1 is used below
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; the index result, stored through the saved A0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        ; NOTE(review): the 0x48 byte ends up in front of whatever prefix bytes the
        ;               assembler emits for the instruction - confirm that it is
        ;               still honoured as REX.W in that position.
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpestri_u128

;;
; The pcmpistrm instruction template.
;
; The sources are placed in xmm1/xmm2 because the instruction delivers its
; result in xmm0, which is then stored to [A0].
;
; @param    A0      Pointer to the XMM0 register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistrm_u128

;;
; The pcmpestrm instruction template.
;
; @param    A0      Pointer to the XMM0 register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        push    xDX                                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax]      ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpestrm_u128


;;
; pinsrw instruction.
;
; The _u64 variant works on an MMX register (5 byte stubs), the _u128 variant
; on an XMM register (6 byte stubs).
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      The 16 bit input operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE            ; MMX registers are used here, like in the other mmN helpers

        movq    mm0, [A0]
        lea     T0, [A2 + A2*4]         ; sizeof(pinsrw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  mm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
        dw 0xfaff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x104ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u64

BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u128

;;
; vpinsrw instruction.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 16 bit input operand (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE            ; VEX encoded instruction, so the AVX pair applies

        vmovdqu xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        mov     A1, A2                  ; A2 requires longer encoding on Windows
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpinsrw_u128


;;
; pextrw instruction.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      The media register size operand (input): passed by value for the _u64 variant, by pointer for the _u128 variant.
; @param A2 The 8-bit immediate ; BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16 PROLOGUE_3_ARGS IEMIMPL_SSE_PROLOGUE movq mm0, A1 lea T0, [A2 + A2*4] ; sizeof(pextrw+ret) == 5 lea T1, [.imm0 xWrtRIP] lea T1, [T1 + T0] call T1 mov word [A0], T0_16 IEMIMPL_SSE_EPILOGUE EPILOGUE_3_ARGS %assign bImm 0 %rep 256 .imm %+ bImm: pextrw T0_32, mm0, bImm ret %assign bImm bImm + 1 %endrep .immEnd: ; 256*5 == 0x500 dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big. dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small. ENDPROC iemAImpl_pextrw_u64 BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16 PROLOGUE_3_ARGS IEMIMPL_SSE_PROLOGUE movdqu xmm0, [A1] lea T1, [.imm0 xWrtRIP] lea T0, [A2 + A2*2] ; sizeof(pextrw+ret) == 6: (A2 * 3) *2 lea T1, [T1 + T0*2] call T1 mov word [A0], T0_16 IEMIMPL_SSE_EPILOGUE EPILOGUE_3_ARGS %assign bImm 0 %rep 256 .imm %+ bImm: pextrw T0_32, xmm0, bImm ret %assign bImm bImm + 1 %endrep .immEnd: ; 256*6 == 0x600 dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big. dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small. ENDPROC iemAImpl_pextrw_u128 ;; ; vpextrw instruction. ; ; @param A0 Pointer to the 16bit output operand (output). ; @param A1 Pointer to the source media register size operand (input). ; @param A2 The 8-bit immediate ; BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16 PROLOGUE_3_ARGS IEMIMPL_SSE_PROLOGUE movdqu xmm0, [A1] lea T1, [.imm0 xWrtRIP] lea T0, [A2 + A2*2] ; sizeof(vpextrw+ret) == 6: (A2 * 3) *2 lea T1, [T1 + T0*2] call T1 mov word [A0], T0_16 IEMIMPL_SSE_EPILOGUE EPILOGUE_3_ARGS %assign bImm 0 %rep 256 .imm %+ bImm: vpextrw T0_32, xmm0, bImm ret %assign bImm bImm + 1 %endrep .immEnd: ; 256*6 == 0x600 dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big. dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small. 
ENDPROC iemAImpl_vpextrw_u128


;;
; Template for the movmskp{s,d} workers.  One expansion emits three
; functions: the SSE 128-bit worker plus the AVX 128-bit and 256-bit workers.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     [A0], T0_8              ; only the low byte of the mask is stored

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0
        mov     [A0], T0_8              ; only the low byte of the mask is stored

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     [A0], T0_8              ; only the low byte of the mask is stored

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd


;;
; Restores the SSE MXCSR register with the original value.
;
; Reads the current MXCSR, keeps only its exception flag bits, ORs them into
; the guest MXCSR taken from the FXSTATE and stores the result at the address
; given by parameter 1.  Afterwards MXCSR is reloaded from the dword at the
; top of the stack and that dword is popped.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note     Restores the stack pointer.
; @note     Expects a saved MXCSR value at [xSP] on entry - presumably left
;           there by SSE_LD_FXSTATE_MXCSR, which is defined outside this section.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; reload the MXCSR value saved on the stack
        add     xSP, 4
%endmacro


;;
; cvttsd2si instruction - 32-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0_32, [A3]
        mov     [A2], T0_32             ; 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64

;;
; cvttsd2si instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0, [A3]
        mov     [A2], T0                ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64


;;
; cvtsd2si instruction - 32-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0_32, [A3]
        mov     [A2], T0_32             ; 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64

;;
; cvtsd2si instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0, [A3]
        mov     [A2], T0                ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64


;;
; cvttss2si instruction - 32-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0_32, [A3]
        mov     [A2], T0_32             ; 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32

;;
; cvttss2si instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0, [A3]
        mov     [A2], T0                ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32


;;
; cvtss2si instruction - 32-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0_32, [A3]
        mov     [A2], T0_32             ; 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32

;;
; cvtss2si instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0, [A3]
        mov     [A2], T0                ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32


;;
; cvtsi2ss instruction - 32-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, dword [A3]       ; 32-bit integer source
        movd    [A2], xmm0              ; store the single precision result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32

;;
; cvtsi2ss instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, qword [A3]       ; 64-bit integer source
        movd    [A2], xmm0              ; store the single precision result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64


;;
; cvtsi2sd instruction - 32-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, dword [A3]       ; 32-bit integer source
        movq    [A2], xmm0              ; store the double precision result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32

;;
; cvtsi2sd instruction - 64-bit variant.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, qword [A3]       ; 64-bit integer source
        movq    [A2], xmm0              ; store the double precision result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64


;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        ; Save the current MXCSR; this dword stays on the stack after the macro.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Take FZ, the rounding control and DAZ from the guest value and set all exception mask bits.
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; Load the composed value through a temporary stack slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro


;;
; Restores the SSE MXCSR register with the original value.
;
; The exception flags currently set in MXCSR are ORed into the dword at the
; address given by parameter 1; then MXCSR is reloaded from the dword at the
; top of the stack (as left by SSE_LD_FXSTATE_MXCSR_ONLY) and that dword is
; popped.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note     Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%1]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; reload the MXCSR value saved on the stack
        add     xSP, 4
%endmacro


;
; UCOMISS (SSE)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL  iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        ucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ucomiss_u128

BEGINPROC_FASTCALL  iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vucomiss_u128


;
; UCOMISD (SSE)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL  iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        ucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ucomisd_u128

BEGINPROC_FASTCALL  iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vucomisd_u128

;
; COMISS (SSE)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL  iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        comiss  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_comiss_u128

BEGINPROC_FASTCALL  iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; second source
        movdqu  xmm0, [A2]              ; first source
        vcomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcomiss_u128


;
; COMISD (SSE)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL  iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comisd  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_comisd_u128

BEGINPROC_FASTCALL  iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcomisd_u128


;;
; Need to move this as well somewhere better?
;
; Two consecutive 128-bit source operands (4 dwords each).
;
struc IEMMEDIAF2XMMSRC
    .uSrc1        resd 4
    .uSrc2        resd 4
endstruc


;
; CMPPS (SSE)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        ; Only the low byte of A3 is the immediate.  The calling convention does
        ; not promise that the remaining register bits are zero, so clear them
        ; before the value is used as a table index.
        and     A3_32, 0xff
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T0, [A3 + A3*4]         ; sizeof(cmpps+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]           ; T1 = address of the stub for this immediate
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        cmpps   xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_cmpps_u128

;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        ; Only the low byte of A3 is the immediate.  The calling convention does
        ; not promise that the remaining register bits are zero, so clear them
        ; before the value is used as a table index.
        and     A3_32, 0xff
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(insn+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for this immediate
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd

;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        ; Only the low byte of A3 is the immediate.  The calling convention does
        ; not promise that the remaining register bits are zero, so clear them
        ; before the value is used as a table index.
        and     A3_32, 0xff
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(insn+ret) == 7: T0 = A3 * 3 ...
        lea     T0, [A3 + T0*2]         ; ... T0 = A3 + 2 * (A3 * 3) = A3 * 7
        lea     T1, [T1 + T0]           ; T1 = address of the stub for this immediate
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*(6+1) == 0x700
dw 0xf8ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x106ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd


;;
; SSE instructions of the form
;    xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        %1      mm0, xmm0
        movq    [A1], mm0               ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi

;;
; SSE instructions of the form
;    xxx xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    mm0, A2                 ; the source is passed by value
        movdqu  xmm0, [A1]              ; current destination content
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd

;;
; SSE instructions of the form
;    xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    xmm0, A2                ; the source is passed by value
        %1      mm0, xmm0
        movq    [A1], mm0               ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi

;
; All forms of RDRAND and RDSEED
;
; @param    1       The instruction name (rdrand or rdseed).
; @param    2       The register receiving the random value (ax, eax or rax).
; @param    3       The operand size in bits, used as the function name suffix.
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2
        mov     [A0], %2                ; store the value; mov leaves the flags untouched
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64


;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Only the low byte of A2 is the immediate.  The calling convention does
        ; not promise that the remaining register bits are zero, so clear them
        ; before the value is used as a table index.
        and     A2_32, 0xff
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(insnX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for this immediate
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        sha1rnds4 xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_sha1rnds4_u128


;;
; sha256rnds2 xmm1, xmm2 (XMM0 is an implicit third operand).
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; the implicit operand has to live in xmm0
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]
        sha256rnds2 xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128


;
; 32-bit forms of ADCX and ADOX
;
; @param    1       The instruction name (adcx or adox).
; @param    2       The EFLAGS bit the instruction reads and updates.
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_3_ARGS                 ; three arguments, so use the 3-argument pair like the other 3-argument workers

        IEM_LOAD_FLAGS A1, %2, 0
        %1      A2_32, [A0]             ; A2 = A2 + [A0] + flag
        mov     [A0], A2_32             ; mov leaves the flags untouched for IEM_SAVE_FLAGS
        IEM_SAVE_FLAGS A1, %2, 0

        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro

;
; 64-bit forms of ADCX and ADOX
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      64-bit source operand 1 (input).
;
; Macro parameters: 1 = the instruction name (adcx or adox),
;                   2 = the EFLAGS bit the instruction reads and updates.
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_3_ARGS                 ; three arguments, so use the 3-argument pair like the other 3-argument workers

        IEM_LOAD_FLAGS A1, %2, 0
        %1      A2, [A0]                ; A2 = A2 + [A0] + flag
        mov     [A0], A2                ; mov leaves the flags untouched for IEM_SAVE_FLAGS
        IEM_SAVE_FLAGS A1, %2, 0

        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF