- Timestamp: Apr 11, 2024 1:03:03 PM
- Location: trunk/src/VBox/VMM/VMMAll
- Files: 2 edited
trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S
(r104240 to r104296. The whole block below is newly added; it follows the existing ROR_64 instantiations, i.e. right after "ROR_64 iemAImpl_ror_u64_amd, 0".)

/*
 * Rotate Left thru Carry.
 */

/* uint32_t iemAImpl_rcl_u8( uint32_t fEFlagsIn, uint8_t  *pu8Dst,  uint8_t cShift); */
/* uint32_t iemAImpl_rcl_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
/* uint32_t iemAImpl_rcl_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
.macro RCL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to rotate anything at all? */
        and     w2, w2, #0x1f
        cbz     w2, 99f

.ifne \a_cBits < 32
        /*
         * 8 and 16 bit: w2 = w2 % (a_cBits + 1).
         *
         * Given that the w2 range is 0 thru 31, the 16-bit case can be reduced
         * to:
         *      w2 = w2 >= 17 ? w2 - 17 : w2
         *
         * In the 8-bit scenario we're modding with 9, so we need to do it in
         * two steps:
         *      w2 = w2 >= 18 ? w2 - 18 : w2
         *      w2 = w2 >=  9 ? w2 -  9 : w2
         *
         * For comparison clang generates the following for 16-bit:
         *      mov     w9, #0xf0f0f0f1
         *      umull   x9, w2, w9
         *      lsr     x9, x9, #36
         *      bfi     w9, w9, #4, #1
         *      sub     w2, w2, w9
         *
         * The 8-bit variant differs only in the constants used:
         *      mov     w9, #0x38e38e39
         *      umull   x9, w2, w9
         *      lsr     x9, x9, #33
         *      bfi     w9, w9, #3, #2
         *      subs    w8, w2, w9
         */
        mov     w7, w2
 .ifne \a_cBits == 16
        subs    w3, w2, #17
        csel    w2, w3, w2, hs
 .else
        subs    w3, w2, #18
        csel    w2, w3, w2, hs
        subs    w3, w2, #9
        csel    w2, w3, w2, hs
 .endif
 .ifne \a_fIntelFlags
        cbz     w2, 99f                 /* Intel: Skip everything if the modded rotate count is zero. */
 .endif
.endif

        /*
         * Do the rotating: (w8 << w2) | (CF << (w2 - 1)) | (w2 > 1 ? (w8 >> (a_cBits - w2 + 1)) : 0)
         */
        and     w3, w0, #X86_EFL_CF
        subs    w4, w2, #1              /* Also: prep for 'w2 > 1' (w2 can't be zero, btw) - think: cmp w2, #1 */
        lslv    x3, x3, x4              /* x3 = CF << (w2 - 1) */

        mov     w4, #(\a_cBits + 1)
        sub     w4, w4, w2              /* w4 = a_cBits - w2 + 1 */

        ldr\a_LdStSuff  w8, [x1]
        lslv    x9, x8, x2
        lsrv    w10, w8, w4
        csel    w10, wzr, w10, eq       /* if w2 == 1: w10 = 0; else: w10 = w8 >> (a_cBits - w2 + 1); */
        orr     x9, x9, x3              /* shifted CF */
        orr     x9, x9, x10
        str\a_LdStSuff  w9, [x1]

        /*
         * Calculate EFLAGS - only CF and OF.
         */
.ifeq \a_fIntelFlags
        cbz     w2, 88f                 /* AMD: CF doesn't change if the modded rotate count is zero (only OF does actually). */
.endif
        bfxil   x0, x9, #(\a_cBits), #1 /* CF = last bit rotated out */
88:

.ifne \a_fIntelFlags
        /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     w11, w8, w8, LSL #1
        lsr     w11, w11, #(\a_cBits - 1)
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.else
        /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        eor     w11, w0, w9, LSR #(\a_cBits - 1)
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.endif

99:
        ret
        .cfi_endproc
.endm

RCL_8_16_32 iemAImpl_rcl_u8,         8, 1, b
RCL_8_16_32 iemAImpl_rcl_u8_intel,   8, 1, b
RCL_8_16_32 iemAImpl_rcl_u8_amd,     8, 0, b

RCL_8_16_32 iemAImpl_rcl_u16,       16, 1, h
RCL_8_16_32 iemAImpl_rcl_u16_intel, 16, 1, h
RCL_8_16_32 iemAImpl_rcl_u16_amd,   16, 0, h

RCL_8_16_32 iemAImpl_rcl_u32,       32, 1,
RCL_8_16_32 iemAImpl_rcl_u32_intel, 32, 1,
RCL_8_16_32 iemAImpl_rcl_u32_amd,   32, 0,

/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* uint32_t iemAImpl_rcl_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
.macro RCL_64, a_Name, a_fIntelFlags
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w2, w2, #0x3f
        cbz     w2, 99f

        /*
         * Do the rotating: (w8 << w2) | (CF << (w2 - 1)) | (w2 > 1 ? (w8 >> (64 - w2 + 1)) : 0)
         */
        and     w3, w0, #X86_EFL_CF
        subs    w4, w2, #1              /* Also: prep for 'w2 > 1' (w2 can't be zero, btw) - think: cmp w2, #1 */
        lslv    x3, x3, x4              /* x3 = CF << (w2 - 1) */

        mov     w4, #(64 + 1)
        sub     w4, w4, w2              /* w4 = 64 - w2 + 1 */

        ldr     x8, [x1]
        lslv    x9, x8, x2
        lsrv    x10, x8, x4
        csel    x10, xzr, x10, eq       /* if w2 == 1: x10 = 0; else: x10 = x8 >> (64 - w2 + 1); */
        orr     x9, x9, x3              /* shifted CF */
        orr     x9, x9, x10
        str     x9, [x1]

        /*
         * Calculate EFLAGS - only CF and OF.
         */
        neg     x11, x2
        lsr     x11, x8, x11
        bfi     w0, w11, #0, #1         /* CF = last bit rotated out. */

.ifne \a_fIntelFlags
        /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     x11, x8, x8, LSL #1
        lsr     x11, x11, #(64 - 1)
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.else
        /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        eor     x11, x0, x9, LSR #(64 - 1)
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.endif

99:
        ret
        .cfi_endproc
.endm

RCL_64 iemAImpl_rcl_u64,       1
RCL_64 iemAImpl_rcl_u64_intel, 1
RCL_64 iemAImpl_rcl_u64_amd,   0
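To make the new code easier to follow without reading AArch64 assembly, here is a compact C model of what the macros compute, put together from the formulas quoted in the comments above: the count reduction modulo cOpBits + 1, the widened rotate through CF, and the Intel-vs-AMD OF rules. The function and variable names (RclRef and friends) are invented for this sketch; it is not the EMIT_RCL expansion from IEMAllAImplC.cpp and should be read as an illustration only.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative reference model only; not VirtualBox code.  It handles the 8, 16
   and 32 bit widths for rotate counts that are still non-zero after the modulo
   (cOpBits + 1) reduction; the zero-count corner cases (Intel leaves the flags
   untouched, AMD still recomputes OF) take separate branches in the assembly. */
static uint32_t RclRef(uint32_t uDst, unsigned cOpBits, uint8_t cShift,
                       int fIntelFlags, unsigned *pfCF, unsigned *pfOF)
{
    unsigned const fCarryIn = *pfCF & 1;
    unsigned cnt = cShift & 0x1f;      /* 'and w2, w2, #0x1f' (the 64-bit variant masks with 0x3f) */
    if (cOpBits < 32)
        cnt %= cOpBits + 1;            /* what the compare-and-subtract sequence computes */
    if (!cnt)
        return uDst;                   /* simplified; see the Intel vs AMD zero-count notes above */

    /* (uDst << cnt) | (CF << (cnt - 1)) | (cnt > 1 ? uDst >> (cOpBits - cnt + 1) : 0) */
    uint64_t const uResult = ((uint64_t)uDst << cnt)
                           | ((uint64_t)fCarryIn << (cnt - 1))
                           | (cnt > 1 ? (uint64_t)uDst >> (cOpBits - cnt + 1) : 0);

    *pfCF = (unsigned)((uResult >> cOpBits) & 1);                     /* CF = last bit rotated out */
    if (fIntelFlags)    /* Intel: OF reflects the first rotate step. */
        *pfOF = ((uDst ^ (uDst << 1)) >> (cOpBits - 1)) & 1;
    else                /* AMD: OF reflects the last rotate step. */
        *pfOF = (unsigned)(((uResult >> (cOpBits - 1)) ^ *pfCF) & 1);
    return (uint32_t)(uResult & (((uint64_t)1 << cOpBits) - 1));
}

int main(void)
{
    /* The compare-and-subtract count reduction from the comment block above
       agrees with a plain modulo for every possible masked count. */
    for (unsigned c = 0; c < 32; c++)
    {
        unsigned c16 = c >= 17 ? c - 17 : c;           /* 16-bit: one step */
        unsigned c8  = c >= 18 ? c - 18 : c;           /* 8-bit: two steps */
        c8 = c8 >= 9 ? c8 - 9 : c8;
        assert(c16 == c % 17 && c8 == c % 9);
    }

    /* rcl al,1 with al=0x80 and CF=0 gives al=0x00, CF=1, OF=1. */
    unsigned fCF = 0, fOF = 0;
    uint32_t const uRes = RclRef(0x80, 8, 1, 1 /*Intel*/, &fCF, &fOF);
    printf("rcl8(0x80, 1) -> %#04x CF=%u OF=%u\n", (unsigned)uRes, fCF, fOF);
    return 0;
}

The loop in main also confirms the point of the big comment block: for counts 0 thru 31 the single compare-and-subtract step matches % 17 for 16-bit operands, and the two-step variant matches % 9 for 8-bit operands.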
trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp
(r104269 to r104296; '-' marks removed lines, '+' marks added lines, everything else is unchanged context.)

 }

-#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
+#ifndef RT_ARCH_ARM64
+
+# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
 EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
-#endif
+# endif
 EMIT_RCL(64, uint64_t, _intel, 1)
 EMIT_RCL(64, uint64_t, _amd, 0)

-#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
+# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
 EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
-#endif
+# endif
 EMIT_RCL(32, uint32_t, _intel, 1)
 EMIT_RCL(32, uint32_t, _amd, 0)

-#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
+# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
 EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
-#endif
+# endif
 EMIT_RCL(16, uint16_t, _intel, 1)
 EMIT_RCL(16, uint16_t, _amd, 0)

-#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
+# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
 EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
-#endif
+# endif
 EMIT_RCL(8, uint8_t, _intel, 1)
 EMIT_RCL(8, uint8_t, _amd, 0)
+
+#endif /* !RT_ARCH_ARM64 */
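In other words, the change wraps the portable EMIT_RCL instantiations in a new RT_ARCH_ARM64 guard, since the hand-written assembly above now supplies every rcl variant on ARM64. As a readability aid (not a literal excerpt; only the 64-bit width is shown and the comments are added here), the resulting structure looks roughly like this:

#ifndef RT_ARCH_ARM64  /* new outer guard: ARM64 builds use IEMAllAImpl-arm64.S instead */
# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCL(64, uint64_t, RT_NOTHING, 1)  /* C fallback, skipped where an assembly version is used */
# endif
EMIT_RCL(64, uint64_t, _intel, 1)      /* Intel EFLAGS semantics */
EMIT_RCL(64, uint64_t, _amd, 0)        /* AMD EFLAGS semantics */
#endif /* !RT_ARCH_ARM64 */

The 32, 16 and 8 bit widths follow the same pattern, with the (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) guard on the generic variant.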