Changeset 106297 in vbox
- Timestamp: Oct 12, 2024 1:07:27 AM
- Location: trunk/src/VBox/VMM
- Files: 3 edited
trunk/src/VBox/VMM/VMMAll/IEMAllN8veExecMem.cpp
(r106128 → r106297)

Added: the old bitmap scanner is kept under VBOX_STRICT for comparison and assertions, and a new word-at-a-time scanner is introduced alongside it:

#if defined(VBOX_STRICT) || 0
/**
 * The old bitmap scanner code, for comparison and assertions.
 */
static uint32_t iemExecMemAllocatorFindReqFreeUnitsOld(uint64_t *pbmAlloc, uint32_t cToScan, uint32_t cReqUnits)
{
    /** @todo This can probably be done more efficiently for non-x86 systems. */
    int iBit = ASMBitFirstClear(pbmAlloc, cToScan);
    while (iBit >= 0 && (uint32_t)iBit <= cToScan - cReqUnits)
    {
        uint32_t idxAddBit = 1;
        while (idxAddBit < cReqUnits && !ASMBitTest(pbmAlloc, (uint32_t)iBit + idxAddBit))
            idxAddBit++;
        if (idxAddBit >= cReqUnits)
            return (uint32_t)iBit;
        iBit = ASMBitNextClear(pbmAlloc, cToScan, iBit + idxAddBit - 1);
    }
    return UINT32_MAX;
}
#endif


/**
 * Bitmap scanner code that looks for a run of @a cReqUnits zero bits.
 *
 * Booting win11 with an r165098 release build, the average native TB size is
 * around 9 units (of 256 bytes).  So, it is unlikely we need to scan any
 * subsequent words once we hit a patch of zeros, thus @a a_fBig.
 *
 * @todo This needs more tweaking.  While it *is* faster than the old code,
 *       it doesn't seem like it's all that much. :/
 */
template<const bool a_fBig>
static uint32_t iemExecMemAllocatorFindReqFreeUnits(uint64_t *pbmAlloc, uint32_t c64WordsToScan, uint32_t cReqUnits)
{
    /*
     * Scan the (section of the) allocation bitmap in 64-bit words.
     */
    unsigned cPrevLeadingZeros = 0;
    for (uint32_t off = 0; off < c64WordsToScan; off++)
    {
        uint64_t uWord = pbmAlloc[off];
        if (uWord == UINT64_MAX)
        {
            do
            {
                off++;
                if (off < c64WordsToScan)
                    uWord = pbmAlloc[off];
                else
                    return UINT32_MAX;
            } while (uWord == UINT64_MAX);

            cPrevLeadingZeros = 0;
        }

        if (uWord != 0)
        {
            /*
             * Fend off large requests we cannot possibly satisfy here before
             * doing any further work.
             */
            if (!a_fBig || cReqUnits < 64 + cPrevLeadingZeros)
            {
#ifdef __GNUC__
                unsigned cZerosInWord = __builtin_popcountl(~uWord);
#else
# ifdef RT_ARCH_AMD64
                unsigned cZerosInWord = __popcnt64(~uWord);
# else
#  pragma message("need popcount intrinsic or something...") /** @todo port me: Win/ARM. */
                unsigned cZerosInWord = 0;
                for (uint64_t uTmp = ~uWord; uTmp; cZerosInWord++)
                    uTmp &= uTmp - 1; /* Clears the least significant bit set. */
# endif
#endif
                if (cZerosInWord + cPrevLeadingZeros >= cReqUnits)
                {
                    /* Check if we've got a patch of zeros at the trailing end
                       when joined with the previous word: */
#ifdef __GNUC__
                    unsigned cTrailingZeros = __builtin_ctzl(uWord);
#else
                    unsigned cTrailingZeros = ASMBitFirstSetU64(uWord) - 1;
#endif
                    if (cPrevLeadingZeros + cTrailingZeros >= cReqUnits)
                        return off * 64 - cPrevLeadingZeros;

                    /*
                     * Try leading zeros before we get on with the tedious stuff.
                     */
#ifdef __GNUC__
                    cPrevLeadingZeros = __builtin_clzl(uWord);
#else
                    cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
#endif
                    if (cPrevLeadingZeros >= cReqUnits)
                        return (off + 1) * 64 - cPrevLeadingZeros;

                    /*
                     * Check the popcount again sans leading & trailing before
                     * looking inside the word.
                     */
                    cZerosInWord -= cPrevLeadingZeros + cTrailingZeros;
                    if (cZerosInWord >= cReqUnits)
                    {
                        /* 1; 64 - 0 - 1 = 63; */
                        unsigned const iBitLast = 64 - cPrevLeadingZeros - cReqUnits; /** @todo boundary */
                        unsigned       iBit     = cTrailingZeros;
                        uWord >>= cTrailingZeros;
                        do
                        {
                            Assert(uWord & 1);
#ifdef __GNUC__
                            unsigned iZeroBit = __builtin_ctzl(~uWord);
#else
                            unsigned iZeroBit = ASMBitFirstSetU64(~uWord) - 1;
#endif
                            iBit   += iZeroBit;
                            uWord >>= iZeroBit;
                            Assert(iBit <= iBitLast);
                            Assert((uWord & 1) == 0);
#ifdef __GNUC__
                            unsigned cZeros = __builtin_ctzl(uWord);
#else
                            unsigned cZeros = ASMBitFirstSetU64(uWord) - 1;
#endif
                            if (cZeros >= cReqUnits)
                                return off * 64 + iBit;

                            cZerosInWord -= cZeros; /* (may underflow as we will count shifted in zeros) */
                            iBit   += cZeros;
                            uWord >>= cZeros;
                        } while ((int)cZerosInWord >= (int)cReqUnits && iBit < iBitLast);
                    }
                    continue; /* we've already calculated cPrevLeadingZeros */
                }
            }

            /* Update the leading (MSB) zero count. */
#ifdef __GNUC__
            cPrevLeadingZeros = __builtin_clzl(uWord);
#else
            cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
#endif
        }
        /*
         * uWord == 0
         */
        else
        {
            if RT_CONSTEXPR_IF(!a_fBig)
                return off * 64 - cPrevLeadingZeros;
            else
            {
                if (cPrevLeadingZeros + 64 >= cReqUnits)
                    return off * 64 - cPrevLeadingZeros;
                for (uint32_t off2 = off + 1;; off2++)
                {
                    if (off2 < c64WordsToScan)
                    {
                        uWord = pbmAlloc[off2];
                        if (uWord == UINT64_MAX)
                        {
                            cPrevLeadingZeros = 0;
                            break;
                        }
                        if (uWord == 0)
                        {
                            if (cPrevLeadingZeros + (off2 - off + 1) * 64 >= cReqUnits)
                                return off * 64 - cPrevLeadingZeros;
                        }
                        else
                        {
#ifdef __GNUC__
                            unsigned cTrailingZeros = uWord ? __builtin_ctzl(uWord) : 64;
#else
                            unsigned cTrailingZeros = uWord ? ASMBitFirstSetU64(uWord) - 1 : 64;
#endif
                            if (cPrevLeadingZeros + (off2 - off) * 64 + cTrailingZeros >= cReqUnits)
                                return off * 64 - cPrevLeadingZeros;
#ifdef __GNUC__
                            cPrevLeadingZeros = __builtin_clzl(uWord);
#else
                            cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
#endif
                            break;
                        }
                    }
                    else
                        return UINT32_MAX;
                }
            }
        }
    }
    return UINT32_MAX;
}
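The speed-up comes from working on the allocation bitmap one 64-bit word at a time: a popcount quickly rejects words that cannot possibly hold the request, and ctz/clz locate free runs at the word edges (including runs straddling the previous word) before the code bothers scanning inside the word. Below is a minimal standalone sketch of that idea in plain C++ using only GCC/Clang builtins; it is not the VBox code, only handles requests of up to 64 units (the <false> instantiation above), and omits the multi-word handling of the a_fBig path.

#include <cstdint>
#include <cstdio>

/* Find the first run of cReq clear bits in a bitmap of cWords 64-bit words,
   allowing the run to straddle a word boundary via the carried leading-zero
   count.  Assumes cReq <= 64; returns UINT32_MAX if nothing is found. */
static uint32_t findClearRun(const uint64_t *pbm, uint32_t cWords, unsigned cReq)
{
    unsigned cCarry = 0; /* free bits at the top (MSB end) of the previous word */
    for (uint32_t off = 0; off < cWords; off++)
    {
        uint64_t const uWord = pbm[off];
        if (uWord == UINT64_MAX)
        {
            cCarry = 0;                                     /* fully allocated word */
            continue;
        }
        if (uWord == 0)
            return off * 64 - cCarry;                       /* fully free word */

        if (__builtin_popcountll(~uWord) + cCarry >= cReq)  /* can it fit at all? */
        {
            unsigned const cTrail = (unsigned)__builtin_ctzll(uWord); /* free bits at the bottom */
            if (cCarry + cTrail >= cReq)
                return off * 64 - cCarry;                   /* run straddles the boundary */

            /* Walk the word: skip each run of set bits, measure the zero run after it. */
            unsigned iBit = cTrail;
            uint64_t u    = uWord >> cTrail;                /* bit 0 is now set */
            for (;;)
            {
                unsigned const cOnes = (unsigned)__builtin_ctzll(~u); /* length of the set-bit run */
                iBit += cOnes;
                if (iBit + cReq > 64)
                    break;                                  /* no room left in this word */
                u >>= cOnes;
                unsigned const cZeros = u ? (unsigned)__builtin_ctzll(u) : 64 - iBit;
                if (cZeros >= cReq)
                    return off * 64 + iBit;
                iBit += cZeros;
                u   >>= cZeros;
            }
        }
        cCarry = (unsigned)__builtin_clzll(uWord);          /* free bits at the top, for the next word */
    }
    return UINT32_MAX;
}

int main()
{
    /* Word 0: bits 12..15 and 32..63 are free; word 1: bits 0..7 are free. */
    uint64_t const abm[2] = { UINT64_C(0x00000000ffff0fff), UINT64_C(0xffffffffffffff00) };
    std::printf("run of  4 units starts at bit %u\n", findClearRun(abm, 2, 4));  /* 12 */
    std::printf("run of 40 units starts at bit %u\n", findClearRun(abm, 2, 40)); /* 32 */
    return 0;
}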
Changed in the allocation helper ("Try allocate a block of @a cReqUnits in the chunk @a idxChunk"): the old inline loop that scanned the bitmap for cReqUnits consecutive clear bits (using ASMBitFirstClear/ASMBitTest/ASMBitNextClear, as in the Old scanner above) is dropped, and the function now calls the new scanner, cross-checking it against the old one in strict builds. The remainder of the body is unchanged apart from re-indentation:

    Assert(cToScan + idxFirst <= pExecMemAllocator->cUnitsPerChunk);
    pbmAlloc += idxFirst / 64;
    cToScan  += idxFirst & 63;
    Assert(!(cToScan & 63));

#if 1
    uint32_t const iBit = cReqUnits < 64
                        ? iemExecMemAllocatorFindReqFreeUnits<false>(pbmAlloc, cToScan / 64, cReqUnits)
                        : iemExecMemAllocatorFindReqFreeUnits<true>( pbmAlloc, cToScan / 64, cReqUnits);
    Assert(iBit == iemExecMemAllocatorFindReqFreeUnitsOld(pbmAlloc, cToScan, cReqUnits));
#else
    uint32_t const iBit = iemExecMemAllocatorFindReqFreeUnitsOld(pbmAlloc, cToScan, cReqUnits);
#endif
    if (iBit != UINT32_MAX)
    {
        ASMBitSetRange(pbmAlloc, (uint32_t)iBit, (uint32_t)iBit + cReqUnits);

        PIEMEXECMEMCHUNK const pChunk = &pExecMemAllocator->aChunks[idxChunk];
        pChunk->cFreeUnits -= cReqUnits;
        pChunk->idxFreeHint = (uint32_t)iBit + cReqUnits;

        pExecMemAllocator->cAllocations += 1;
        uint32_t const cbReq = cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
        pExecMemAllocator->cbAllocated  += cbReq;
        pExecMemAllocator->cbFree       -= cbReq;
        pExecMemAllocator->idxChunkHint  = idxChunk;

        void * const pvMemRw = (uint8_t *)pChunk->pvChunkRw
                             + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);

        if (ppChunkCtx)
            *ppChunkCtx = pChunk->pCtx;

        /*
         * Initialize the header and return.
         */
# ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
        PIEMEXECMEMALLOCHDR const pHdr = (PIEMEXECMEMALLOCHDR)pvMemRw;
        pHdr->uMagic   = IEMEXECMEMALLOCHDR_MAGIC;
        pHdr->idxChunk = idxChunk;
        pHdr->pTb      = pTb;

        if (ppvExec)
            *ppvExec = (uint8_t *)pChunk->pvChunkRx
                     + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT)
                     + sizeof(*pHdr);

        return pHdr + 1;
#else
        if (ppvExec)
            *ppvExec = (uint8_t *)pChunk->pvChunkRx
                     + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);

        RT_NOREF(pTb);
        return pvMem;
#endif
    }

    return NULL;
}
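On a hit, the allocator marks the run in the bitmap, updates the chunk and allocator statistics, and turns the bit index into byte offsets into the chunk's RW/RX mappings by shifting with the sub-allocation unit size. The following toy sketch shows the same bookkeeping in miniature; the names, the single 64-unit chunk, the fixed 256-byte unit and the naive mask-based scan are all invented for illustration and are not how the VBox allocator is actually laid out.

#include <cstdint>
#include <cstdio>
#include <cstring>

/* A toy chunk: 64 units of 256 bytes, tracked by one 64-bit allocation bitmap. */
struct ToyChunk
{
    uint64_t bmAlloc;            /* bit i set = unit i is in use */
    uint32_t cFreeUnits;
    uint32_t idxFreeHint;        /* where the next scan could start */
    uint8_t  abData[64 * 256];   /* stands in for the RW/RX chunk mappings */
};

static unsigned const g_cUnitShift = 8; /* 256-byte units */

/* Allocate cbReq bytes from the chunk, returning NULL if no free run exists. */
static void *toyAlloc(ToyChunk *pChunk, size_t cbReq)
{
    unsigned const cReqUnits = (unsigned)((cbReq + (1u << g_cUnitShift) - 1) >> g_cUnitShift);
    for (unsigned iBit = 0; iBit + cReqUnits <= 64; iBit++)
    {
        /* Build a mask covering units iBit .. iBit + cReqUnits - 1. */
        uint64_t const fMask = (cReqUnits < 64 ? (UINT64_C(1) << cReqUnits) - 1 : UINT64_MAX) << iBit;
        if (!(pChunk->bmAlloc & fMask))     /* all requested units free? */
        {
            pChunk->bmAlloc     |= fMask;   /* corresponds to ASMBitSetRange() */
            pChunk->cFreeUnits  -= cReqUnits;
            pChunk->idxFreeHint  = iBit + cReqUnits;
            return &pChunk->abData[(size_t)iBit << g_cUnitShift];
        }
    }
    return NULL;
}

int main()
{
    static ToyChunk s_Chunk;
    std::memset(&s_Chunk, 0, sizeof(s_Chunk));
    s_Chunk.cFreeUnits = 64;

    void *pv1 = toyAlloc(&s_Chunk, 9 * 256);   /* roughly the average TB size mentioned above */
    void *pv2 = toyAlloc(&s_Chunk, 100);       /* rounds up to one unit */
    std::printf("pv1=%p pv2=%p cFreeUnits=%u\n", pv1, pv2, s_Chunk.cFreeUnits);
    return 0;
}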
Changed in the Darwin instruction cache flushing code: the unconditional

    /*
     * Flush the instruction cache:
     *      https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
     */
    /* sys_dcache_flush(pv, cb); - not necessary */
    sys_icache_invalidate(pv, cb);
    RT_NOREF(pVCpu);

is replaced by a hand-rolled cache maintenance sequence, with sys_icache_invalidate() kept as a configurable fallback:

#ifdef RT_OS_DARWIN
    /*
     * We need to synchronize the stuff we wrote to the data cache with the
     * instruction cache, since these aren't coherent on arm (or at least not
     * on Apple Mn CPUs).
     *
     * Note! Since we don't share any JIT'ed code with the other CPUs, we don't
     *       really care whether the dcache is fully flushed back to memory. It
     *       only needs to hit the level 2 cache, which the level 1 instruction
     *       and data caches seem to be sharing.  In ARM terms, we need to reach
     *       a point of unification (PoU), rather than a point of coherency (PoC).
     *
     *       https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
     *
     *       https://developer.arm.com/documentation/den0013/d/Caches/Point-of-coherency-and-unification
     *
     * Experimenting with the approach used by sys_icache_invalidate() and
     * tweaking it a little could let us shave off a bit of effort.  The thing
     * that slows the Apple code down on an M2 (running Sonoma 13.4) seems to
     * be the 'DSB ISH' instructions performed every 20 icache line flushes.
     * Skipping these saves ~100ns or more per TB when profiling the native
     * recompiler on the TBs from a win11 full boot/desktop/shutdown sequence.
     * Thus we will leave DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB undefined if we
     * can.
     *
     * There appears not to be much difference between the DSB options 'ISH',
     * 'ISHST', 'NSH' and 'NSHST'.  The latter is theoretically all we need, so
     * we'll use that one.
     *
     * See https://developer.arm.com/documentation/100941/0101/Barriers for
     * details on the barrier options.
     *
     * Note! The CFGM value "/IEM/HostICacheInvalidationViaHostAPI" can be used
     *       to disable this experimental code should it misbehave.
     */
    uint8_t const fHostICacheInvalidation = pVCpu->iem.s.fHostICacheInvalidation;
    if (!(fHostICacheInvalidation & IEMNATIVE_ICACHE_F_USE_HOST_API))
    {
# define DCACHE_ICACHE_SYNC_DSB_OPTION  "nshst"
/*# define DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB*/

        /* Skipping this is fine, but doesn't impact perf much. */
        __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION);

        /* Invalidate the icache for the range [pv,pv+cb). */
# ifdef DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB
        size_t const cIvauDsbEvery = 20;
        unsigned     cDsb          = cIvauDsbEvery;
# endif
        size_t const cbCacheLine   = 64;
        size_t       cbInvalidate  = cb + ((uintptr_t)pv & (cbCacheLine - 1));
        size_t       cCacheLines   = RT_ALIGN_Z(cbInvalidate, cbCacheLine) / cbCacheLine;
        uintptr_t    uPtr          = (uintptr_t)pv & ~(uintptr_t)(cbCacheLine - 1);
        for (;; uPtr += cbCacheLine)
        {
            __asm__ /*__volatile__*/("ic ivau, %0" : : "r" (uPtr));
            cCacheLines -= 1;
            if (!cCacheLines)
                break;
# ifdef DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB
            cDsb -= 1;
            if (cDsb != 0)
            { /* likely */ }
            else
            {
                __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION);
                cDsb = cIvauDsbEvery;
            }
# endif
        }

        /*
         * The DSB here is non-optional, it seems.
         *
         * The following ISB can be omitted on M2 without any obvious side
         * effects, and doing so produces better numbers in the profiling
         * scenario mentioned above.  This could be related to the kHasICDSB
         * flag in cpu_capabilities.h, but that flag doesn't appear to be set
         * here (M2, Sonoma 13.4).
         *
         * The inclusion of the ISB barrier has therefore been made
         * configurable, with a default of skipping it.
         */
        if (!(fHostICacheInvalidation & IEMNATIVE_ICACHE_F_END_WITH_ISH))
            __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION
                                 ::: "memory");
        else
            __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION "\n\t"
                                 "isb"
                                 ::: "memory");
    }
    else
        sys_icache_invalidate(pv, cb);

#elif defined(RT_OS_LINUX) && defined(RT_ARCH_ARM64)
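For reference, a compact standalone version of this kind of data-cache/instruction-cache synchronisation might look as follows. It mirrors the sequence above only loosely: it assumes a fixed 64-byte line size (real code should derive it from CTR_EL0), keeps the conservative DSB ISH + ISB ending rather than the tuned NSHST variant, and on non-AArch64 hosts simply defers to the compiler's __builtin___clear_cache(). It is an illustration, not the VBox implementation.

#include <stddef.h>
#include <stdint.h>

#if defined(__aarch64__)
/* Make code newly written through a writable mapping visible to the
   instruction stream: order the stores, invalidate the icache lines
   covering [pv, pv+cb), then complete the maintenance and resteer. */
static void toyICacheSync(void *pv, size_t cb)
{
    size_t const    cbLine = 64;                    /* assumed; query CTR_EL0 in real code */
    uintptr_t       uPtr   = (uintptr_t)pv & ~(uintptr_t)(cbLine - 1);
    uintptr_t const uEnd   = (uintptr_t)pv + cb;

    __asm__ __volatile__("dsb ishst" ::: "memory"); /* order the stores first */
    for (; uPtr < uEnd; uPtr += cbLine)
        __asm__ __volatile__("ic ivau, %0" : : "r" (uPtr) : "memory");
    __asm__ __volatile__("dsb ish\n\t"              /* complete the invalidations */
                         "isb"                      /* then resteer the front end */
                         ::: "memory");
}
#else
/* Portable fallback on other hosts/compilers. */
static void toyICacheSync(void *pv, size_t cb)
{
    __builtin___clear_cache((char *)pv, (char *)pv + cb);
}
#endif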
trunk/src/VBox/VMM/VMMR3/IEMR3.cpp
(r106212 → r106297)

Added: two CFGM keys controlling the new instruction cache invalidation code, with the resulting flag byte stored per VCPU further down:

    AssertLogRelRCReturn(rc, rc);

    /** @cfgm{/IEM/HostICacheInvalidationViaHostAPI, bool, false}
     * Whether to use any available host OS API for flushing the instruction cache
     * after completing a translation block. */
    bool fFlag = false;
    rc = CFGMR3QueryBoolDef(pIem, "HostICacheInvalidationViaHostAPI", &fFlag, false);
    AssertLogRelRCReturn(rc, rc);
    uint8_t fHostICacheInvalidation = fFlag ? IEMNATIVE_ICACHE_F_USE_HOST_API : 0;

    /** @cfgm{/IEM/HostICacheInvalidationEndWithIsb, bool, false}
     * Whether to include an ISB in the instruction cache invalidation sequence
     * after completing a translation block. */
    fFlag = false;
    rc = CFGMR3QueryBoolDef(pIem, "HostICacheInvalidationEndWithIsb", &fFlag, false);
    AssertLogRelRCReturn(rc, rc);
    if (fFlag)
        fHostICacheInvalidation |= IEMNATIVE_ICACHE_F_END_WITH_ISH;

#endif /* VBOX_WITH_IEM_RECOMPILER */

…

    pVCpu->iem.s.uRegFpCtrl = IEMNATIVE_SIMD_FP_CTRL_REG_NOT_MODIFIED;
    pVCpu->iem.s.uTbNativeRecompileAtUsedCount = uTbNativeRecompileAtUsedCount;
    pVCpu->iem.s.fHostICacheInvalidation = fHostICacheInvalidation;
#endif
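Both keys default to false, i.e. the hand-rolled sequence without the trailing ISB. For ad-hoc testing they can presumably be flipped through the usual extradata-to-CFGM override mechanism (the VBoxInternal/ prefix); assuming that mechanism applies to the /IEM/ subtree as usual, something like the following would switch a VM back to the host API path:

    VBoxManage setextradata "MyVM" "VBoxInternal/IEM/HostICacheInvalidationViaHostAPI" 1

(The VM name and the exact override path here are illustrative; check the VirtualBox documentation on VBoxInternal extradata before relying on it.)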
trunk/src/VBox/VMM/include/IEMInternal.h
(r106212 → r106297)

Added: the per-VCPU flag byte consumed by the Darwin code above, with the trailing padding shrunk by one uint64_t (au64Padding goes from [1]/[3] to a commented-out [0]/[2]) so the structure size stays unchanged:

    uint64_t u64Placeholder;
#endif
    /**
     * Whether we should use the instruction cache invalidation APIs of the
     * host OS or our own version of it (macOS). */
    uint8_t fHostICacheInvalidation;
#define IEMNATIVE_ICACHE_F_USE_HOST_API     UINT8_C(0x01) /**< Use the host API (macOS) instead of our code. */
#define IEMNATIVE_ICACHE_F_END_WITH_ISH     UINT8_C(0x02) /**< Whether to end with an ISH barrier (arm). */
    bool afRecompilerStuff2[7];
    /** @} */

…

#ifdef IEM_WITH_TLB_TRACE
    /*uint64_t au64Padding[0];*/
#else
    uint64_t au64Padding[2];
#endif
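Carving a new byte-sized field plus filler booleans out of padding, and shrinking the trailing padding array to compensate, is what keeps the per-VCPU structure's size and member offsets stable. A tiny, purely illustrative C++ sketch of that pattern (hypothetical struct, not the IEM one), with a compile-time size guard:

#include <cstdint>

struct ToyCpuState
{
    uint64_t u64SomethingImportant;
    uint8_t  fHostICacheInvalidation;   /* new byte-sized flag field */
    bool     afStuffPadding[7];         /* filler so the next member stays 8-byte aligned */
    uint64_t au64Padding[2];            /* was [3] before the new field was squeezed in */
};

/* Fails to compile if the structure accidentally changes size. */
static_assert(sizeof(ToyCpuState) == 4 * sizeof(uint64_t),
              "ToyCpuState changed size - adjust the padding!");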