Changeset 103003 in vbox

- Timestamp: Jan 23, 2024 4:19:17 PM
- svn:sync-xref-src-repo-rev: 161235
- Location: trunk/src/VBox/VMM
- Files: 3 edited
trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S (r102977 → r103003)
/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#include <iprt/asmdefs-arm.h>
#include <iprt/x86.h>


#if RT_CLANG_PREREQ(15, 0)
 .arch_extension flagm /* not necessary */
#else
 /* clang 12.0.x defaults to apple-a12. M1 is more similar to A14, I guess.
    For some reason +crc makes cfinv work (with clang 12). 'flagm' isn't
    recognized, nor is the 'fmi' in the error message for cfinv. 'flagm'
    works for v15 and is enabled by default, it seems. */
 .cpu apple-a14+crc
#endif


.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
        /*
         * Translate the arm NZCV bits into corresponding EFLAGS bits.
         */
        .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
#if 0
        /* Maybe just a tiny bit slower than the next one. */
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        .ifeq \fSkipFlags & X86_EFL_OF
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
        .else
        lsr     \regTmp, \regTmp, #29
        .endif
        eor     \regTmp, \regTmp, #1                    /* inverts the carry flag to x86 style. */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#else
        /* This seems to be the faster one... */
        cfinv
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        .ifeq (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
        .else
        lsr     \regTmp, \regTmp, #29
        .endif
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#endif
        .else
        /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
        .ifeq \fSkipFlags & X86_EFL_ZF
        cset    \regTmp, eq
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_CF
        cset    \regTmp, cc
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_OF
        cset    \regTmp, vs
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_SF
        cset    \regTmp, mi
        bfi     \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
        .endif
        .endif

        /*
         * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */

        /*
         * Auxiliary carry / borrow flag. This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((w8 ^ w1 ^ w9) & X86_EFL_AF) >> X86_EFL_AF_BIT */

        /* done */
.endm
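The NZCV-to-EFLAGS translation, the XOR-folding parity trick, and the AF computation in CALC_EFLAGS are easier to check in plain C. The following is a minimal sketch, not part of the changeset; the helper name and EFL_*_BIT constants are invented here, but the bit positions match the CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 layout noted in the assembly comments:

#include <stdint.h>

/* Hypothetical names for the EFLAGS bit positions used above. */
enum { EFL_CF_BIT = 0, EFL_PF_BIT = 2, EFL_AF_BIT = 4, EFL_ZF_BIT = 6, EFL_SF_BIT = 7, EFL_OF_BIT = 11 };

/* What CALC_EFLAGS computes for a 32-bit subtraction uResult = uLeft - uRight. */
static uint32_t CalcEflagsForSub32(uint32_t fEfl, uint32_t uLeft, uint32_t uRight, uint32_t uResult)
{
    /* CF: x86 wants the borrow, which is the inverse of the ARM carry, hence the cfinv. */
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_CF_BIT)) | ((uint32_t)(uLeft < uRight) << EFL_CF_BIT);
    /* ZF and SF map directly from the Z and N condition bits. */
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_ZF_BIT)) | ((uint32_t)(uResult == 0)   << EFL_ZF_BIT);
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_SF_BIT)) | ((uResult >> 31)            << EFL_SF_BIT);
    /* PF: set for even parity of the low byte; the shifts mirror the three 'eor ... LSR' steps. */
    uint32_t uPar = uResult ^ (uResult >> 4);
    uPar ^= uPar >> 2;
    uPar ^= uPar >> 1;
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_PF_BIT)) | (((uPar & 1) ^ 1) << EFL_PF_BIT);
    /* AF: borrow out of bit 3, i.e. bit 4 of dst ^ src ^ result. */
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_AF_BIT)) | ((((uLeft ^ uRight ^ uResult) >> EFL_AF_BIT) & 1) << EFL_AF_BIT);
    /* OF: signed overflow of the subtraction (the macro takes this from V, or the caller skips it). */
    fEfl = (fEfl & ~(UINT32_C(1) << EFL_OF_BIT)) | ((((uLeft ^ uRight) & (uLeft ^ uResult)) >> 31) << EFL_OF_BIT);
    return fEfl;
}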
…

/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */

/*
 * The SUB instruction.
 */

/* void iemAImpl_sub_u8(uint8_t *puDst, uint8_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u8)
        .globl          NAME(iemAImpl_sub_u8)
NAME(iemAImpl_sub_u8):
        .cfi_startproc
        /* Do the subtraction. */
        ldrb    w8, [x0]
        /*and     w1, w1, #0xff - should not be necessary. */
        subs    w9, w8, w1                              /* w9 = w8 (*puDst) - w1 (uSrc) */
        setf8   w9
        strb    w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        and     w9, w9, #0xff
        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF

        /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w1                             /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #7
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc


/* void iemAImpl_sub_u16(uint16_t *puDst, uint16_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u16)
        .globl          NAME(iemAImpl_sub_u16)
NAME(iemAImpl_sub_u16):
        .cfi_startproc
        /* Do the subtraction. */
        ldrh    w8, [x0]
        /*and     w1, w1, #0xffff - should not be necessary. */
        subs    w9, w8, w1                              /* w9 = w8 (*puDst) - w1 (uSrc) */
        setf16  w9
        strh    w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        and     w9, w9, #0xffff
        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF

        /* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w1                             /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000)) ). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #15
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc


/* void iemAImpl_sub_u32(uint32_t *puDst, uint32_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u32)
        .globl          NAME(iemAImpl_sub_u32)
NAME(iemAImpl_sub_u32):
        .cfi_startproc
        /* Do the subtraction. */
        ldr     w8, [x0]
        subs    w9, w8, w1                              /* w9 = w8 (*puDst) - w1 (uSrc) */
        str     w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

#if 0
        /* Translate the arm NZCV bits into corresponding EFLAGS bits. */
# if 0 /* maybe just a tiny bit slower than the next one. */
        mrs     x11, NZCV                               /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        eor     w11, w11, #1                            /* inverts the carry flag to x86 style. */
        bfi     w10, w11, #X86_EFL_CF_BIT, #1           /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w10, w11, #X86_EFL_ZF_BIT, #2           /* SF(7),ZF(6) = NZ */
# elif 1 /* seems the faster one... */
        cfinv
        mrs     x11, NZCV                               /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        bfi     w10, w11, #X86_EFL_CF_BIT, #1           /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w10, w11, #X86_EFL_ZF_BIT, #2           /* SF(7),ZF(6) = NZ */
# else
        cset    w11, eq
        bfi     w10, w11, #X86_EFL_ZF_BIT, #1
        cset    w11, cc
        bfi     w10, w11, #X86_EFL_CF_BIT, #1
        cset    w11, vs
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
        cset    w11, mi
        bfi     w10, w11, #X86_EFL_SF_BIT, #1
# endif

        /* Parity calculation for low byte of the result (sucks that there is no popcount for gprs). */
        eor     w11, w9, w9, LSR #4
        eor     w11, w11, w11, LSR #2
        eor     w11, w11, w11, LSR #1
        eor     w11, w11, #1
        bfi     w10, w11, #X86_EFL_PF_BIT, #1           /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */

        /* Auxiliary carry / borrow flag. This is related to 8-bit BCD. */
        eor     w11, w8, w1
        eor     w11, w11, w9
        lsr     w11, w11, #X86_EFL_AF_BIT
        bfi     w10, w11, #X86_EFL_AF_BIT, #1           /* AF(4) = ((w8 ^ w1 ^ w9) & X86_EFL_AF) >> X86_EFL_AF_BIT */
#else
        CALC_EFLAGS x10, x9, x8, x1, x11
#endif

        str     w10, [x2]
        ret
        .cfi_endproc


/* void iemAImpl_sub_u64(uint64_t *puDst, uint64_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u64)
        .globl          NAME(iemAImpl_sub_u64)
NAME(iemAImpl_sub_u64):
        .cfi_startproc
        /* Do the subtraction. */
        ldr     x8, [x0]
        subs    x9, x8, x1                              /* x9 = x8 (*puDst) - x1 (uSrc) */
        str     x9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        CALC_EFLAGS x10, x9, x8, x1, x11

        str     w10, [x2]
        ret
        .cfi_endproc
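Why the extra eor/eor/and/lsr sequence in the 8- and 16-bit functions: subs followed by setf8/setf16 does not produce the x86 overflow semantics for subtraction, so OF is derived from the classic sign rule instead. A self-contained sketch of the 16-bit case (hypothetical helper, not from this changeset), with a worked check:

#include <stdint.h>

/* OF for uResult = uDst - uSrc: signed overflow happens when the operands have
   different signs and the result's sign differs from the minuend's. This is
   bit 15 of (uDst ^ uSrc) & (uDst ^ uResult), matching the assembly above. */
static uint32_t SubU16Overflows(uint16_t uDst, uint16_t uSrc, uint16_t uResult)
{
    return (((uDst ^ uSrc) & (uDst ^ uResult)) >> 15) & 1;
}

/* Worked check: 0x8000 - 1 = 0x7fff flips sign, so OF must be set:
   (0x8000 ^ 0x0001) & (0x8000 ^ 0x7fff) = 0x8001 & 0xffff = 0x8001, bit 15 = 1. */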
trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp (r102896 → r103003)
/*
 * SUB
 */
# if !defined(RT_ARCH_ARM64)

IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
…
}

#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
…
}

#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !RT_ARCH_ARM64 */
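The new '# if !defined(RT_ARCH_ARM64)' / '# endif' guard compiles the portable C fallbacks out on ARM64, where the assembly versions above now provide the same symbols. As a rough idea of what such a fallback does, here is a simplified sketch with the iemAImpl_sub_u32 signature, reusing the CalcEflagsForSub32 helper sketched earlier; the actual VBox code instead goes through IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC:

/* Simplified sketch only, not the VBox implementation. */
static void SketchSubU32(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags)
{
    uint32_t const uDst    = *puDst;
    uint32_t const uResult = uDst - uSrc;
    *puDst    = uResult;
    *pfEFlags = CalcEflagsForSub32(*pfEFlags, uDst, uSrc, uResult);
}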
trunk/src/VBox/VMM/testcase/tstIEMAImpl.cpp (r102896 → r103003)
#include <iprt/string.h>
#include <iprt/test.h>
#include <iprt/time.h>
#include <iprt/thread.h>
#include <VBox/version.h>

…

static const char *g_apszIncludeTestPatterns[64];
static const char *g_apszExcludeTestPatterns[64];

/** Higher value means longer benchmarking. */
static uint64_t g_cPicoSecBenchmark = 0;

static unsigned g_cVerbosity = 0;

…

#endif


/** Based on a quick probe run, guess how long to run the benchmark. */
static uint32_t EstimateIterations(uint32_t cProbeIterations, uint64_t cNsProbe)
{
    uint64_t cPicoSecPerIteration = cNsProbe * 1000 / cProbeIterations;
    uint64_t cIterations          = g_cPicoSecBenchmark / cPicoSecPerIteration;
    if (cIterations > _2G)
        return _2G;
    if (cIterations < _4K)
        return _4K;
    return RT_ALIGN_32((uint32_t)cIterations, _4K);
}


#define TEST_BINARY_OPS(a_cBits, a_uType, a_Fmt, a_TestType, a_aSubTests) \
GEN_BINARY_TESTS(a_cBits, a_Fmt, a_TestType) \
\
static uint64_t BinU ## a_cBits ## Bench(uint32_t cIterations, PFNIEMAIMPLBINU ## a_cBits pfn, a_TestType const *pEntry) \
{ \
    uint32_t const fEflIn = pEntry->fEflIn; \
    a_uType  const uDstIn = pEntry->uDstIn; \
    a_uType  const uSrcIn = pEntry->uSrcIn; \
    cIterations /= 4; \
    RTThreadYield(); \
    uint64_t const nsStart = RTTimeNanoTS(); \
    for (uint32_t i = 0; i < cIterations; i++) \
    { \
        uint32_t fBenchEfl = fEflIn; \
        a_uType  uBenchDst = uDstIn; \
        pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
        \
        fBenchEfl = fEflIn; \
        uBenchDst = uDstIn; \
        pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
        \
        fBenchEfl = fEflIn; \
        uBenchDst = uDstIn; \
        pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
        \
        fBenchEfl = fEflIn; \
        uBenchDst = uDstIn; \
        pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
    } \
    return RTTimeNanoTS() - nsStart; \
} \
\
static void BinU ## a_cBits ## Test(void) \
…
        PFNIEMAIMPLBINU ## a_cBits pfn = a_aSubTests[iFn].pfn; \
        uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
        if (!cTests) { RTTestSkipped(g_hTest, "no tests"); continue; } \
        for (uint32_t iVar = 0; iVar < cVars; iVar++) \
        { \
…
                pfn(&uDst, paTests[iTest].uSrcIn, &fEfl); \
                if (   uDst != paTests[iTest].uDstOut \
                    || fEfl != paTests[iTest].fEflOut) \
                    RTTestFailed(g_hTest, "#%u%s: efl=%#08x dst=" a_Fmt " src=" a_Fmt " -> efl=%#08x dst=" a_Fmt ", expected %#08x & " a_Fmt "%s - %s\n", \
                                 iTest, !iVar ? "" : "/n", paTests[iTest].fEflIn, paTests[iTest].uDstIn, paTests[iTest].uSrcIn, \
…
                *g_pfEfl = paTests[iTest].fEflIn; \
                pfn(g_pu ## a_cBits, paTests[iTest].uSrcIn, g_pfEfl); \
                RTTEST_CHECK(g_hTest, *g_pu ## a_cBits == paTests[iTest].uDstOut); \
                RTTEST_CHECK(g_hTest, *g_pfEfl         == paTests[iTest].fEflOut); \
            } \
        } \
\
        /* Benchmark if all succeeded. */ \
        if (g_cPicoSecBenchmark && RTTestSubErrorCount(g_hTest) == 0) \
        { \
            uint32_t const iTest       = cTests / 2; \
            uint32_t const cIterations = EstimateIterations(_64K, BinU ## a_cBits ## Bench(_64K, pfn, &paTests[iTest])); \
            uint64_t const cNsRealRun  = BinU ## a_cBits ## Bench(cIterations, pfn, &paTests[iTest]); \
            RTTestValueF(g_hTest, cNsRealRun * 1000 / cIterations, RTTESTUNIT_PS_PER_CALL, \
                         "%s%s", a_aSubTests[iFn].pszName, iVar ? "-native" : ""); \
        } \
\
        /* Next variation is native. */ \
        pfn = a_aSubTests[iFn].pfnNative; \
    } \

…

        { "--generate",  'g', RTGETOPT_REQ_NOTHING },
        { "--test",      't', RTGETOPT_REQ_NOTHING },
        { "--benchmark", 'b', RTGETOPT_REQ_NOTHING },
        // test selection (both)
        { "--all",       'a', RTGETOPT_REQ_NOTHING },

…

            case 'g':
                enmMode             = kModeGenerate;
                g_cPicoSecBenchmark = 0;
                break;
            case 't':
                enmMode             = kModeTest;
                g_cPicoSecBenchmark = 0;
                break;
            case 'b':
                enmMode              = kModeTest;
                g_cPicoSecBenchmark += RT_NS_1SEC / 2 * UINT64_C(1000); /* half a second in picoseconds */
                break;

…

                 "  -t, --test\n"
                 "    Execute tests.\n"
                 "  -b, --benchmark\n"
                 "    Execute tests and do 0.5 seconds of benchmarking.\n"
                 "    Repeating the option increases the benchmark duration by 0.5 seconds.\n"
                 "\n"
                 "Test selection (both modes):\n"
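The calibration logic is easy to sanity-check with concrete numbers: each -b adds RT_NS_1SEC / 2 * 1000 = 5e11 picoseconds to the budget, the 64K-iteration probe yields a per-call cost in picoseconds, and EstimateIterations scales that to the budget and clamps. A standalone rehearsal of the arithmetic (the literal constants stand in for IPRT's _2G, _4K and RT_ALIGN_32, which are assumptions of this sketch):

#include <stdint.h>
#include <stdio.h>

/* Mirrors EstimateIterations above, but takes the budget as a parameter. */
static uint32_t EstimateIterationsSketch(uint64_t cPicoSecBudget, uint32_t cProbeIterations, uint64_t cNsProbe)
{
    uint64_t const cPicoSecPerIteration = cNsProbe * 1000 / cProbeIterations;
    uint64_t const cIterations          = cPicoSecBudget / cPicoSecPerIteration;
    if (cIterations > UINT64_C(0x80000000))                     /* _2G */
        return UINT32_C(0x80000000);
    if (cIterations < 4096)                                     /* _4K */
        return 4096;
    return (uint32_t)((cIterations + 4095) & ~UINT64_C(4095));  /* align up to 4K */
}

int main(void)
{
    /* One -b = 0.5 s = 500 000 000 000 ps. If the 64K-iteration probe took 100 000 ns,
       that is ~1525 ps/call, so the real run gets roughly 328 million iterations. */
    printf("%u iterations\n", EstimateIterationsSketch(UINT64_C(500000000000), 65536, 100000));
    return 0;
}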