VMMAll

Timestamp:

Jan 23, 2024 4:19:17 PM (14 months ago)

Author:

vboxsync

svn:sync-xref-src-repo-rev:

161235

Message:

VMM/IEM: Assembly version of iemAImpl_sub_*. bugref:10376

Location:

trunk/src/VBox/VMM/VMMAll

Files:

: 2 edited

IEMAllAImpl-arm64.S (modified) (2 diffs)
IEMAllAImplC.cpp (modified) (3 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S

-              r102977
+              r103003
+/*********************************************************************************************************************************
+*       Header Files                                                                                                             *
+*********************************************************************************************************************************/
 #include <iprt/asmdefs-arm.h>
+#include <iprt/x86.h>
+#if RT_CLANG_PREREQ(15, 0)
+        .arch_extension flagm   /* For cfinv / setf8 / setf16.  Not strictly necessary, clang 15
+                                   seems to enable flagm by default. */
+#else
+        /* clang 12.0.x defaults to apple-a12. M1 is more similar to A14, I guess.
+           For some reason adding +crc makes cfinv work (with clang 12).  'flagm' isn't
+           recognized there, nor is the 'fmi' named in the error message for cfinv.
+           'flagm' works with v15 and seems to be enabled by default there. */
+        .cpu            apple-a14+crc
+#endif
+.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
+        /*
+         * CALC_EFLAGS - merge the x86 status flags of an arithmetic operation into regEfl.
+         *
+         * Must be expanded while the ARM NZCV flags still hold the outcome of the
+         * operation, because they are read here (mrs / cset).
+         *
+         *   regEfl      Register holding the EFLAGS value.  CF, ZF, SF and OF (unless
+         *               skipped), PF and AF are inserted with bfi; all other bits are
+         *               left as they were.
+         *   regResult   The arithmetic result; used for the PF and AF calculations only.
+         *   regLeft     First input operand; used for the AF calculation only.
+         *   regRight    Second input operand; used for the AF calculation only.
+         *   regTmp      Scratch register, clobbered.
+         *   fSkipFlags  Mask of X86_EFL_CF / ZF / SF / OF flags that must NOT be taken
+         *               from NZCV (default 0 = take all four).  PF and AF are always
+         *               calculated.  Only 0 and X86_EFL_OF get the mrs based code, any
+         *               other mask uses the per-flag cset code.
+         *
+         * Side effect: the active mrs based path executes cfinv, so the ARM carry flag
+         * is inverted when the macro is done.
+         *
+         * First step: translate the arm NZCV bits into corresponding EFLAGS bits.  CF is
+         * the inverse of the ARM carry flag.
+         */
+ .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
+#if 0
+        /* Maybe just a tiny bit slower than the next one. */
+        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
+  .ifeq \fSkipFlags & X86_EFL_OF
+        lsr     \regTmp, \regTmp, #28
+        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
+        lsr     \regTmp, \regTmp, #1
+  .else
+        lsr     \regTmp, \regTmp, #29
+  .endif
+        eor     \regTmp, \regTmp, #1                    /* inverts the carry flag to x86 style. */
+        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
+        lsr     \regTmp, \regTmp, #1
+        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
+#else
+        /* This seems to be the faster one... */
+        cfinv                                           /* inverts the carry flag to x86 style before reading NZCV. */
+        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
+  .ifeq (\fSkipFlags & X86_EFL_OF)
+        lsr     \regTmp, \regTmp, #28
+        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
+        lsr     \regTmp, \regTmp, #1
+  .else
+        lsr     \regTmp, \regTmp, #29
+  .endif
+        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C (already inverted) */
+        lsr     \regTmp, \regTmp, #1
+        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
+#endif
+ .else
+        /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
+  .ifeq \fSkipFlags & X86_EFL_ZF
+        cset    \regTmp, eq
+        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
+  .endif
+  .ifeq \fSkipFlags & X86_EFL_CF
+        cset    \regTmp, cc                             /* CF is set when the ARM carry flag is clear. */
+        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
+  .endif
+  .ifeq \fSkipFlags & X86_EFL_OF
+        cset    \regTmp, vs
+        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
+  .endif
+  .ifeq \fSkipFlags & X86_EFL_SF
+        cset    \regTmp, mi
+        bfi     \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
+  .endif
+ .endif
+        /*
+         * Parity calculation for the low byte of the result (there is no popcount
+         * instruction for GPRs): XOR-fold bits 7:0 down into bit 0, then invert it.
+         */
+        eor     \regTmp, \regResult, \regResult, LSR #4
+        eor     \regTmp, \regTmp, \regTmp, LSR #2
+        eor     \regTmp, \regTmp, \regTmp, LSR #1
+        eor     \regTmp, \regTmp, #1
+        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = (popcount(regResult & 0xff) & 1) ^ 1 */
+        /*
+         * Auxiliary carry / borrow flag.  This is related to 8-bit BCD.
+         */
+        eor     \regTmp, \regLeft, \regRight
+        eor     \regTmp, \regTmp, \regResult
+        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
+        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((regLeft ^ regRight ^ regResult) & X86_EFL_AF) >> X86_EFL_AF_BIT */
+        /* done */
+.endm
 …
 */
+/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t  *pu8Mem,  uint8_t  *pu8Reg)); */
+/*
+ * The SUB instruction.
+ */
+/*
+ * void iemAImpl_sub_u8(uint8_t *puDst, uint8_t uSrc, uint32_t *pEFlags);
+ *
+ * 8-bit SUB: *puDst -= uSrc, updating CF, PF, AF, ZF, SF and OF in *pEFlags.
+ *
+ * In:       x0 = puDst, w1 = uSrc, x2 = pEFlags.
+ * Clobbers: w8-w12 and NZCV.
+ */
+        .p2align        2
+        .private_extern NAME(iemAImpl_sub_u8)
+        .globl          NAME(iemAImpl_sub_u8)
+NAME(iemAImpl_sub_u8):
+        .cfi_startproc
+        /* Do the subtraction. */
+        ldrb    w8, [x0]
+        /*and     w1, w1, #0xff - should not be necessary.
+          NOTE(review): this relies on the caller passing uSrc zero extended in w1 - confirm for
+          each target ABI, the 32-bit subs below would otherwise yield a wrong carry. */
+        subs    w9, w8, w1                      /* w9 = w8 (*puDst) - w1 (uSrc); C is clear on borrow. */
+        setf8   w9                              /* Redo N and Z for the 8-bit result, C is left as is. */
+        strb    w9, [x0]
+        /* Load EFLAGS. */
+        ldr     w10, [x2]                       /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
+        and     w9, w9, #0xff                   /* Only the 8-bit result takes part in the flag calcs. */
+        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF
+        /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
+           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.)
+           OF = bit 7 of (dst ^ src) & (dst ^ result). */
+        eor     w11, w8, w1                     /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
+        eor     w12, w8, w9                     /* input dst ^ result */
+        and     w11, w12, w11
+        lsr     w11, w11, #7                    /* Move the 8-bit sign position down to bit 0. */
+        bfi     w10, w11, #X86_EFL_OF_BIT, #1
+        /* Done with EFLAGS. */
+        str     w10, [x2]
+        ret
+        .cfi_endproc
+/*
+ * void iemAImpl_sub_u16(uint16_t *puDst, uint16_t uSrc, uint32_t *pEFlags);
+ *
+ * 16-bit SUB: *puDst -= uSrc, updating CF, PF, AF, ZF, SF and OF in *pEFlags.
+ *
+ * In:       x0 = puDst, w1 = uSrc, x2 = pEFlags.
+ * Clobbers: w11-w15 and NZCV.
+ */
+        .p2align        2
+        .private_extern NAME(iemAImpl_sub_u16)
+        .globl          NAME(iemAImpl_sub_u16)
+NAME(iemAImpl_sub_u16):
+        .cfi_startproc
+        /* Subtract and write the result back; uSrc (w1) is used unmasked. */
+        ldrh    w13, [x0]                       /* w13 = *puDst */
+        subs    w14, w13, w1                    /* w14 = w13 - w1; sets C for the CF calculation */
+        setf16  w14                             /* N and Z for the 16-bit result */
+        strh    w14, [x0]
+        /* Fetch the guest flags and trim the result to 16 bits for the flag work. */
+        ldr     w15, [x2]                       /* w15 = eflags */
+        and     w14, w14, #0xffff
+        /* OF is worked out by hand since the setf16 overflow flag does not fit subtraction
+           (see IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC): bit 15 of (dst ^ src) & (dst ^ result).
+           None of these instructions modify NZCV, so CALC_EFLAGS can still follow. */
+        eor     w12, w13, w1                    /* dst ^ src */
+        eor     w11, w13, w14                   /* dst ^ result */
+        and     w12, w12, w11
+        ubfx    w12, w12, #15, #1               /* isolate the 16-bit sign position */
+        bfi     w15, w12, #X86_EFL_OF_BIT, #1
+        /* CF, ZF, SF, PF and AF; the OF bit just inserted is skipped. */
+        CALC_EFLAGS x15, x14, x13, x1, x11, X86_EFL_OF
+        /* Commit EFLAGS. */
+        str     w15, [x2]
+        ret
+        .cfi_endproc
+/*
+ * void iemAImpl_sub_u32(uint32_t *puDst, uint32_t uSrc, uint32_t *pEFlags);
+ *
+ * 32-bit SUB: *puDst -= uSrc, updating CF, PF, AF, ZF, SF and OF in *pEFlags.
+ *
+ * In:       x0 = puDst, w1 = uSrc, x2 = pEFlags.
+ * Clobbers: w8-w11 and NZCV.
+ */
+        .p2align        2
+        .private_extern NAME(iemAImpl_sub_u32)
+        .globl          NAME(iemAImpl_sub_u32)
+NAME(iemAImpl_sub_u32):
+        .cfi_startproc
+        /* Do the subtraction. */
+        ldr     w8, [x0]
+        subs    w9, w8, w1                      /* w9 = w8 (*puDst) - w1 (uSrc)  */
+        str     w9, [x0]
+        /* Load EFLAGS. */
+        ldr     w10, [x2]                       /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
+        /* The 32-bit subs leaves NZCV matching the operand size, so all of CF, ZF, SF and OF
+           are taken from NZCV by the macro; PF and AF are derived from w9, w8 and w1. */
+        CALC_EFLAGS x10, x9, x8, x1, x11
+        /* Done with EFLAGS. */
+        str     w10, [x2]
+        ret
+        .cfi_endproc
+/*
+ * void iemAImpl_sub_u64(uint64_t *puDst, uint64_t uSrc, uint32_t *pEFlags);
+ *
+ * 64-bit SUB: *puDst -= uSrc, updating CF, PF, AF, ZF, SF and OF in *pEFlags.
+ *
+ * In:       x0 = puDst, x1 = uSrc, x2 = pEFlags.
+ * Clobbers: x12-x15 and NZCV.
+ */
+        .p2align        2
+        .private_extern NAME(iemAImpl_sub_u64)
+        .globl          NAME(iemAImpl_sub_u64)
+NAME(iemAImpl_sub_u64):
+        .cfi_startproc
+        /* Subtract and write the result back. */
+        ldr     x12, [x0]                       /* x12 = *puDst */
+        subs    x13, x12, x1                    /* x13 = x12 - x1 (uSrc); NZCV feeds CALC_EFLAGS */
+        str     x13, [x0]
+        /* Merge the new status flags into the guest EFLAGS. */
+        ldr     w14, [x2]                       /* w14 = eflags */
+        CALC_EFLAGS x14, x13, x12, x1, x15
+        str     w14, [x2]
+        ret
+        .cfi_endproc

trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp

-              r102896
+              r103003
  * SUB
  */
+# if !defined(RT_ARCH_ARM64)
 IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
 …
+}
 # if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
+#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
 IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
 …
+}
+# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
+#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
+# endif /* !RT_ARCH_ARM64 */
 /*

Note: See TracChangeset for help on using the changeset viewer.

Changeset 103003 in vbox for trunk/src/VBox/VMM/VMMAll

Legend:

trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S

trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp

Download in other formats: