Changeset 103003 in vbox

- Timestamp: Jan 23, 2024 4:19:17 PM
- svn:sync-xref-src-repo-rev: 161235
- Location: trunk/src/VBox/VMM
- Files: 3 edited
trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S (r102977 → r103003)
/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#include <iprt/asmdefs-arm.h>
#include <iprt/x86.h>


#if RT_CLANG_PREREQ(15, 0)
 .arch_extension flagm /* not necessary */
#else
 /* clang 12.0.x defaults to apple-a12. M1 is more similar to A14, I guess.
    For some reason +crc makes cfinv work (with clang 12). 'flagm' isn't
    recognized, nor is the 'fmi' in the error message for cfinv. 'flagm'
    works for v15 and is enabled by default, it seems. */
 .cpu apple-a14+crc
#endif


.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
        /*
         * Translate the arm NZCV bits into corresponding EFLAGS bits.
         */
        .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
#if 0
        /* Maybe just a tiny bit slower than the next one. */
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        .ifeq \fSkipFlags & X86_EFL_OF
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
        .else
        lsr     \regTmp, \regTmp, #29
        .endif
        eor     \regTmp, \regTmp, #1                    /* inverts the carry flag to x86 style. */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#else
        /* This seems to be the faster one... */
        cfinv
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        .ifeq (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
        .else
        lsr     \regTmp, \regTmp, #29
        .endif
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#endif
        .else
        /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
        .ifeq \fSkipFlags & X86_EFL_ZF
        cset    \regTmp, eq
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_CF
        cset    \regTmp, cc
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_OF
        cset    \regTmp, vs
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_SF
        cset    \regTmp, mi
        bfi     \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
        .endif
        .endif

        /*
         * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */

        /*
         * Auxiliary carry / borrow flag. This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((w8 ^ w1 ^ w9) & X86_EFL_AF) >> X86_EFL_AF_BIT */

        /* done */
.endm
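The NZCV-to-EFLAGS translation, the XOR-folding parity trick, and the AF computation in CALC_EFLAGS are easier to check in plain C. The following is a minimal sketch, not part of the changeset; the helper name and EFL_*_BIT constants are invented here, but the bit positions match the CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 layout noted in the assembly comments:

#include <stdint.h>

/* Hypothetical names for the EFLAGS bit positions used above. */
enum { EFL_CF_BIT = 0, EFL_PF_BIT = 2, EFL_AF_BIT = 4, EFL_ZF_BIT = 6, EFL_SF_BIT = 7, EFL_OF_BIT = 11 };

/* What CALC_EFLAGS computes for a 32-bit subtraction uResult = uLeft - uRight. */
static uint32_t CalcEflagsForSub32(uint32_t fEfl, uint32_t uLeft, uint32_t uRight, uint32_t uResult)
{
    /* CF: x86 wants the borrow, which is the inverse of the ARM carry, hence the cfinv. */
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_CF_BIT)) | ((uint32_t)(uLeft < uRight) << EFL_CF_BIT);
    /* ZF and SF map directly from the Z and N condition bits. */
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_ZF_BIT)) | ((uint32_t)(uResult == 0)   << EFL_ZF_BIT);
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_SF_BIT)) | ((uResult >> 31)            << EFL_SF_BIT);
    /* PF: set for even parity of the low byte; the shifts mirror the three 'eor ... LSR' steps. */
    uint32_t uPar = uResult ^ (uResult >> 4);
    uPar ^= uPar >> 2;
    uPar ^= uPar >> 1;
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_PF_BIT)) | (((uPar & 1) ^ 1) << EFL_PF_BIT);
    /* AF: borrow out of bit 3, i.e. bit 4 of dst ^ src ^ result. */
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_AF_BIT)) | ((((uLeft ^ uRight ^ uResult) >> EFL_AF_BIT) & 1) << EFL_AF_BIT);
    /* OF: signed overflow of the subtraction (the macro takes this from V, or the caller skips it). */
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_OF_BIT)) | ((((uLeft ^ uRight) & (uLeft ^ uResult)) >> 31) << EFL_OF_BIT);
    return fEfl;
}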
…

/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */

/*
 * The SUB instruction.
 */

/* void iemAImpl_sub_u8(uint8_t *puDst, uint8_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u8)
        .globl          NAME(iemAImpl_sub_u8)
NAME(iemAImpl_sub_u8):
        .cfi_startproc
        /* Do the subtraction. */
        ldrb    w8, [x0]
        /*and     w1, w1, #0xff - should not be necessary. */
        subs    w9, w8, w1                              /* w9 = w8 (*puDst) - w1 (uSrc) */
        setf8   w9
        strb    w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        and     w9, w9, #0xff
        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF

        /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w1                             /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #7
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc


/* void iemAImpl_sub_u16(uint16_t *puDst, uint16_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u16)
        .globl          NAME(iemAImpl_sub_u16)
NAME(iemAImpl_sub_u16):
        .cfi_startproc
        /* Do the subtraction. */
        ldrh    w8, [x0]
        /*and     w1, w1, #0xffff - should not be necessary. */
        subs    w9, w8, w1                              /* w9 = w8 (*puDst) - w1 (uSrc) */
        setf16  w9
        strh    w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        and     w9, w9, #0xffff
        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF

        /* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w1                             /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000)) ). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #15
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc


/* void iemAImpl_sub_u32(uint32_t *puDst, uint32_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u32)
        .globl          NAME(iemAImpl_sub_u32)
NAME(iemAImpl_sub_u32):
        .cfi_startproc
        /* Do the subtraction. */
        ldr     w8, [x0]
        subs    w9, w8, w1                              /* w9 = w8 (*puDst) - w1 (uSrc) */
        str     w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

#if 0
        /* Translate the arm NZCV bits into corresponding EFLAGS bits. */
# if 0 /* maybe just a tiny bit slower than the next one. */
        mrs     x11, NZCV                               /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        eor     w11, w11, #1                            /* inverts the carry flag to x86 style. */
        bfi     w10, w11, #X86_EFL_CF_BIT, #1           /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w10, w11, #X86_EFL_ZF_BIT, #2           /* SF(7),ZF(6) = NZ */
# elif 1 /* seems the faster one... */
        cfinv
        mrs     x11, NZCV                               /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        bfi     w10, w11, #X86_EFL_CF_BIT, #1           /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w10, w11, #X86_EFL_ZF_BIT, #2           /* SF(7),ZF(6) = NZ */
# else
        cset    w11, eq
        bfi     w10, w11, #X86_EFL_ZF_BIT, #1
        cset    w11, cc
        bfi     w10, w11, #X86_EFL_CF_BIT, #1
        cset    w11, vs
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
        cset    w11, mi
        bfi     w10, w11, #X86_EFL_SF_BIT, #1
# endif

        /* Parity calculation for low byte of the result (sucks that there is no popcount for gprs). */
        eor     w11, w9, w9, LSR #4
        eor     w11, w11, w11, LSR #2
        eor     w11, w11, w11, LSR #1
        eor     w11, w11, #1
        bfi     w10, w11, #X86_EFL_PF_BIT, #1           /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */

        /* Auxiliary carry / borrow flag. This is related to 8-bit BCD. */
        eor     w11, w8, w1
        eor     w11, w11, w9
        lsr     w11, w11, #X86_EFL_AF_BIT
        bfi     w10, w11, #X86_EFL_AF_BIT, #1           /* AF(4) = ((w8 ^ w1 ^ w9) & X86_EFL_AF) >> X86_EFL_AF_BIT */
#else
        CALC_EFLAGS x10, x9, x8, x1, x11
#endif

        str     w10, [x2]
        ret
        .cfi_endproc


/* void iemAImpl_sub_u64(uint64_t *puDst, uint64_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u64)
        .globl          NAME(iemAImpl_sub_u64)
NAME(iemAImpl_sub_u64):
        .cfi_startproc
        /* Do the subtraction. */
        ldr     x8, [x0]
        subs    x9, x8, x1                              /* x9 = x8 (*puDst) - x1 (uSrc) */
        str     x9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        CALC_EFLAGS x10, x9, x8, x1, x11

        str     w10, [x2]
        ret
        .cfi_endproc
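Why the extra eor/eor/and/lsr sequence in the 8- and 16-bit functions: subs followed by setf8/setf16 does not produce the x86 overflow semantics for subtraction, so OF is derived from the classic sign rule instead. A self-contained sketch of the 16-bit case (hypothetical helper, not from this changeset), with a worked check:

#include <stdint.h>

/* OF for uResult = uDst - uSrc: signed overflow happens when the operands have
   different signs and the result's sign differs from the minuend's. This is
   bit 15 of (uDst ^ uSrc) & (uDst ^ uResult), matching the assembly above. */
static uint32_t SubU16Overflows(uint16_t uDst, uint16_t uSrc, uint16_t uResult)
{
    return (((uDst ^ uSrc) & (uDst ^ uResult)) >> 15) & 1;
}

/* Worked check: 0x8000 - 1 = 0x7fff flips sign, so OF must be set:
   (0x8000 ^ 0x0001) & (0x8000 ^ 0x7fff) = 0x8001 & 0xffff = 0x8001, bit 15 = 1. */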
trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp (r102896 → r103003)
/*
 * SUB
 */
# if !defined(RT_ARCH_ARM64)

IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
…
}

#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
…
}

#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !RT_ARCH_ARM64 */
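The new '# if !defined(RT_ARCH_ARM64)' / '# endif' guard compiles the portable C fallbacks out on ARM64, where the assembly versions above now provide the same symbols. As a rough idea of what such a fallback does, here is a simplified sketch with the iemAImpl_sub_u32 signature, reusing the CalcEflagsForSub32 helper sketched earlier; the actual VBox code instead goes through IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC:

/* Simplified sketch only, not the VBox implementation. */
static void SketchSubU32(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags)
{
    uint32_t const uDst    = *puDst;
    uint32_t const uResult = uDst - uSrc;
    *puDst    = uResult;
    *pfEFlags = CalcEflagsForSub32(*pfEFlags, uDst, uSrc, uResult);
}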
trunk/src/VBox/VMM/testcase/tstIEMAImpl.cpp (r102896 → r103003)
#include <iprt/string.h>
#include <iprt/test.h>
#include <iprt/time.h>
#include <iprt/thread.h>
#include <VBox/version.h>

…

static const char *g_apszIncludeTestPatterns[64];
static const char *g_apszExcludeTestPatterns[64];

/** Higher value means longer benchmarking. */
static uint64_t g_cPicoSecBenchmark = 0;

static unsigned g_cVerbosity = 0;

…

#endif


/** Based on a quick probe run, guess how long to run the benchmark. */
static uint32_t EstimateIterations(uint32_t cProbeIterations, uint64_t cNsProbe)
{
    uint64_t cPicoSecPerIteration = cNsProbe * 1000 / cProbeIterations;
    uint64_t cIterations          = g_cPicoSecBenchmark / cPicoSecPerIteration;
    if (cIterations > _2G)
        return _2G;
    if (cIterations < _4K)
        return _4K;
    return RT_ALIGN_32((uint32_t)cIterations, _4K);
}


#define TEST_BINARY_OPS(a_cBits, a_uType, a_Fmt, a_TestType, a_aSubTests) \
GEN_BINARY_TESTS(a_cBits, a_Fmt, a_TestType) \
\
static uint64_t BinU ## a_cBits ## Bench(uint32_t cIterations, PFNIEMAIMPLBINU ## a_cBits pfn, a_TestType const *pEntry) \
{ \
    uint32_t const fEflIn = pEntry->fEflIn; \
    a_uType  const uDstIn = pEntry->uDstIn; \
    a_uType  const uSrcIn = pEntry->uSrcIn; \
    cIterations /= 4; \
    RTThreadYield(); \
    uint64_t const nsStart = RTTimeNanoTS(); \
    for (uint32_t i = 0; i < cIterations; i++) \
    { \
        uint32_t fBenchEfl = fEflIn; \
        a_uType  uBenchDst = uDstIn; \
        pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
        \
        fBenchEfl = fEflIn; \
        uBenchDst = uDstIn; \
        pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
        \
        fBenchEfl = fEflIn; \
        uBenchDst = uDstIn; \
        pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
        \
        fBenchEfl = fEflIn; \
        uBenchDst = uDstIn; \
        pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
    } \
    return RTTimeNanoTS() - nsStart; \
} \
\
static void BinU ## a_cBits ## Test(void) \
…
        PFNIEMAIMPLBINU ## a_cBits pfn = a_aSubTests[iFn].pfn; \
        uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
        if (!cTests) { RTTestSkipped(g_hTest, "no tests"); continue; } \
        for (uint32_t iVar = 0; iVar < cVars; iVar++) \
        { \
…
                pfn(&uDst, paTests[iTest].uSrcIn, &fEfl); \
                if (   uDst != paTests[iTest].uDstOut \
                    || fEfl != paTests[iTest].fEflOut) \
                    RTTestFailed(g_hTest, "#%u%s: efl=%#08x dst=" a_Fmt " src=" a_Fmt " -> efl=%#08x dst=" a_Fmt ", expected %#08x & " a_Fmt "%s - %s\n", \
                                 iTest, !iVar ? "" : "/n", paTests[iTest].fEflIn, paTests[iTest].uDstIn, paTests[iTest].uSrcIn, \
…
                *g_pfEfl = paTests[iTest].fEflIn; \
                pfn(g_pu ## a_cBits, paTests[iTest].uSrcIn, g_pfEfl); \
                RTTEST_CHECK(g_hTest, *g_pu ## a_cBits == paTests[iTest].uDstOut); \
                RTTEST_CHECK(g_hTest, *g_pfEfl         == paTests[iTest].fEflOut); \
            } \
        } \
\
        /* Benchmark if all succeeded. */ \
        if (g_cPicoSecBenchmark && RTTestSubErrorCount(g_hTest) == 0) \
        { \
            uint32_t const iTest       = cTests / 2; \
            uint32_t const cIterations = EstimateIterations(_64K, BinU ## a_cBits ## Bench(_64K, pfn, &paTests[iTest])); \
            uint64_t const cNsRealRun  = BinU ## a_cBits ## Bench(cIterations, pfn, &paTests[iTest]); \
            RTTestValueF(g_hTest, cNsRealRun * 1000 / cIterations, RTTESTUNIT_PS_PER_CALL, \
                         "%s%s", a_aSubTests[iFn].pszName, iVar ? "-native" : ""); \
        } \
\
        /* Next variation is native. */ \
        pfn = a_aSubTests[iFn].pfnNative; \
    } \

…

        { "--generate",  'g', RTGETOPT_REQ_NOTHING },
        { "--test",      't', RTGETOPT_REQ_NOTHING },
        { "--benchmark", 'b', RTGETOPT_REQ_NOTHING },
        // test selection (both)
        { "--all",       'a', RTGETOPT_REQ_NOTHING },

…

            case 'g':
                enmMode             = kModeGenerate;
                g_cPicoSecBenchmark = 0;
                break;
            case 't':
                enmMode             = kModeTest;
                g_cPicoSecBenchmark = 0;
                break;
            case 'b':
                enmMode              = kModeTest;
                g_cPicoSecBenchmark += RT_NS_1SEC / 2 * UINT64_C(1000); /* half a second in picoseconds */
                break;

…

                 "  -t, --test\n"
                 "    Execute tests.\n"
                 "  -b, --benchmark\n"
                 "    Execute tests and do 0.5 seconds of benchmarking.\n"
                 "    Repeating the option increases the benchmark duration by 0.5 seconds.\n"
                 "\n"
                 "Test selection (both modes):\n"
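The calibration logic is easy to sanity-check with concrete numbers: each -b adds RT_NS_1SEC / 2 * 1000 = 5e11 picoseconds to the budget, the 64K-iteration probe yields a per-call cost in picoseconds, and EstimateIterations scales that to the budget and clamps. A standalone rehearsal of the arithmetic (the literal constants stand in for IPRT's _2G, _4K and RT_ALIGN_32, which are assumptions of this sketch):

#include <stdint.h>
#include <stdio.h>

/* Mirrors EstimateIterations above, but takes the budget as a parameter. */
static uint32_t EstimateIterationsSketch(uint64_t cPicoSecBudget, uint32_t cProbeIterations, uint64_t cNsProbe)
{
    uint64_t const cPicoSecPerIteration = cNsProbe * 1000 / cProbeIterations;
    uint64_t const cIterations          = cPicoSecBudget / cPicoSecPerIteration;
    if (cIterations > UINT64_C(0x80000000))                     /* _2G */
        return UINT32_C(0x80000000);
    if (cIterations < 4096)                                     /* _4K */
        return 4096;
    return (uint32_t)((cIterations + 4095) & ~UINT64_C(4095));  /* align up to 4K */
}

int main(void)
{
    /* One -b = 0.5 s = 500 000 000 000 ps. If the 64K-iteration probe took 100 000 ns,
       that is ~1525 ps/call, so the real run gets roughly 328 million iterations. */
    printf("%u iterations\n", EstimateIterationsSketch(UINT64_C(500000000000), 65536, 100000));
    return 0;
}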