VirtualBox

Changeset 104858 in vbox


Ignore:
Timestamp:
Jun 5, 2024 6:10:20 PM (8 months ago)
Author:
vboxsync
Message:

VMM/IEM: Optimize executable memory allocation on macOS by removing the need for calling RTMemProtect() to switch between RW and RX memory, bugref:10555.

On macOS it is impossible to have memory allocated RWX, which is the reason for the current RTMemProtect() trickery, which induces additional overhead. However, the Mach
VM API allows remapping the physical memory backed by a virtual address into another region which can have different protection flags. This allows having a virtual
memory region with readable/writeable permissions and a second region with readable/executable permissions both backed by the same physical memory.
A profiling build before this optimization took 76 ticks on average (taken before any memory pruning started to take place because the maximum amount of executable
memory was reached) when allocating executable memory, which translates to 3166.7ns given the 24MHz frequency of CNTVCT_EL0 used as the time source.
With the optimization in place the average is now 15 ticks, or 625ns.

Location:
trunk/src/VBox/VMM
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/VBox/VMM/VMMAll/IEMAllN8veExecMem.cpp

    r104731 r104858  
    7272# if defined(RT_OS_DARWIN)
    7373#  include <libkern/OSCacheControl.h>
     74#  include <mach/mach.h>
     75#  include <mach/mach_vm.h>
    7476#  define IEMNATIVE_USE_LIBUNWIND
    7577extern "C" void  __register_frame(const void *pvFde);
     
    220222    /** Hint where to start searching for free space in the allocation bitmap. */
    221223    uint32_t                idxFreeHint;
    222     /** Pointer to the chunk. */
    223     void                   *pvChunk;
     224    /** Pointer to the readable/writeable view of the memory chunk. */
     225    void                   *pvChunkRw;
     226    /** Pointer to the readable/executable view of the memory chunk. */
     227    void                   *pvChunkRx;
    224228#ifdef IN_RING3
    225229    /**
     
    412416     */
    413417    uint64_t            cbPruned = 0;
    414     uint8_t * const     pbChunk  = (uint8_t *)pExecMemAllocator->aChunks[idxChunk].pvChunk;
     418    uint8_t * const     pbChunk  = (uint8_t *)pExecMemAllocator->aChunks[idxChunk].pvChunkRx;
    415419    while (offChunk < offPruneEnd)
    416420    {
     
    427431            AssertPtr(pTb);
    428432
    429             /* We now have to check that this isn't an old freed header, given
    430                that we don't invalidate the header upon free because of darwin
    431                restrictions on executable memory (iemExecMemAllocatorFree).
    432                This relies upon iemTbAllocatorFreeInner resetting TB members. */
    433             if (   pTb->Native.paInstructions == (PIEMNATIVEINSTR)(pHdr + 1)
    434                 && (pTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_NATIVE)
    435             {
    436                 uint32_t const cbBlock = RT_ALIGN_32(pTb->Native.cInstructions * sizeof(IEMNATIVEINSTR) + sizeof(*pHdr),
    437                                                      IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
    438                 AssertBreakStmt(offChunk + cbBlock <= cbChunk, offChunk += IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE); /* paranoia */
    439 
    440                 iemTbAllocatorFree(pVCpu, pTb);
    441 
    442                 cbPruned += cbBlock;
    443                 offChunk += cbBlock;
    444             }
    445             else
    446                 offChunk += IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE;
     433            uint32_t const cbBlock = RT_ALIGN_32(pTb->Native.cInstructions * sizeof(IEMNATIVEINSTR) + sizeof(*pHdr),
     434                                                 IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
     435            AssertBreakStmt(offChunk + cbBlock <= cbChunk, offChunk += IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE); /* paranoia */
     436
     437            iemTbAllocatorFree(pVCpu, pTb);
     438
     439            cbPruned += cbBlock;
     440            offChunk += cbBlock;
    447441        }
    448442        else
     
    466460 */
    467461static void *iemExecMemAllocatorAllocInChunkInt(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint64_t *pbmAlloc, uint32_t idxFirst,
    468                                                 uint32_t cToScan, uint32_t cReqUnits, uint32_t idxChunk, PIEMTB pTb)
     462                                                uint32_t cToScan, uint32_t cReqUnits, uint32_t idxChunk, PIEMTB pTb, void **ppvExec)
    469463{
    470464    /*
     
    500494            pExecMemAllocator->idxChunkHint  = idxChunk;
    501495
    502             void * const pvMem = (uint8_t *)pChunk->pvChunk
    503                                + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
    504 #ifdef RT_OS_DARWIN
    505             /*
    506              * Sucks, but RTMEM_PROT_EXEC and RTMEM_PROT_WRITE are mutually exclusive
    507              * on darwin.  So, we mark the pages returned as read+write after alloc and
    508              * expect the caller to call iemExecMemAllocatorReadyForUse when done
    509              * writing to the allocation.
    510              *
    511              * See also https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
    512              * for details.
    513              */
    514             /** @todo detect if this is necessary... it wasn't required on 10.15 or
    515              *        whatever older version it was. */
    516             int rc = RTMemProtect(pvMem, cbReq, RTMEM_PROT_WRITE | RTMEM_PROT_READ);
    517             AssertRC(rc);
    518 #endif
     496            void * const pvMemRw = (uint8_t *)pChunk->pvChunkRw
     497                                 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
    519498
    520499            /*
     
    522501             */
    523502# ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
    524             PIEMEXECMEMALLOCHDR const pHdr = (PIEMEXECMEMALLOCHDR)pvMem;
     503            PIEMEXECMEMALLOCHDR const pHdr = (PIEMEXECMEMALLOCHDR)pvMemRw;
    525504            pHdr->uMagic   = IEMEXECMEMALLOCHDR_MAGIC;
    526505            pHdr->idxChunk = idxChunk;
    527506            pHdr->pTb      = pTb;
     507
     508            if (ppvExec)
     509                *ppvExec = (uint8_t *)pChunk->pvChunkRx
     510                         + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT)
     511                         + sizeof(*pHdr);
     512
    528513            return pHdr + 1;
    529514#else
     515            if (ppvExec)
     516                *ppvExec = (uint8_t *)pChunk->pvChunkRx
     517                         + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
     518
    530519            RT_NOREF(pTb);
    531520            return pvMem;
     
    540529
    541530static void *
    542 iemExecMemAllocatorAllocInChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cbReq, PIEMTB pTb)
     531iemExecMemAllocatorAllocInChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cbReq, PIEMTB pTb, void **ppvExec)
    543532{
    544533    /*
     
    559548            void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, idxHint,
    560549                                                             pExecMemAllocator->cUnitsPerChunk - idxHint,
    561                                                              cReqUnits, idxChunk, pTb);
     550                                                             cReqUnits, idxChunk, pTb, ppvExec);
    562551            if (pvRet)
    563552                return pvRet;
     
    565554        return iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, 0,
    566555                                                  RT_MIN(pExecMemAllocator->cUnitsPerChunk, RT_ALIGN_32(idxHint + cReqUnits, 64)),
    567                                                   cReqUnits, idxChunk, pTb);
     556                                                  cReqUnits, idxChunk, pTb, ppvExec);
    568557    }
    569558    return NULL;
     
    574563 * Allocates @a cbReq bytes of executable memory.
    575564 *
    576  * @returns Pointer to the memory, NULL if out of memory or other problem
     565 * @returns Pointer to the readable/writeable memory, NULL if out of memory or other problem
    577566 *          encountered.
    578567 * @param   pVCpu   The cross context virtual CPU structure of the calling
     
    580569 * @param   cbReq   How many bytes are required.
    581570 * @param   pTb     The translation block that will be using the allocation.
    582  */
    583 DECLHIDDEN(void *) iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq, PIEMTB pTb) RT_NOEXCEPT
     571 * @param   ppvExec Where to return the pointer to executable view of the allocated memory, optional.
     572 */
     573DECLHIDDEN(void *) iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq, PIEMTB pTb, void **ppvExec) RT_NOEXCEPT
    584574{
    585575    PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
     
    596586            for (uint32_t idxChunk = idxChunkHint; idxChunk < cChunks; idxChunk++)
    597587            {
    598                 void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq, pTb);
     588                void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq, pTb, ppvExec);
    599589                if (pvRet)
    600590                {
     
    605595            for (uint32_t idxChunk = 0; idxChunk < idxChunkHint; idxChunk++)
    606596            {
    607                 void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq, pTb);
     597                void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq, pTb, ppvExec);
    608598                if (pvRet)
    609599                {
     
    623613
    624614            uint32_t const idxChunk = pExecMemAllocator->cChunks - 1;
    625             void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq, pTb);
     615            void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq, pTb, ppvExec);
    626616            if (pvRet)
    627617            {
     
    655645
    656646
    657 /** This is a hook that we may need later for changing memory protection back
    658  *  to readonly+exec */
     647/** This is a hook to ensure the instruction cache is properly flushed before the code in the memory
     648 * given by @a pv and @a cb is executed */
    659649DECLHIDDEN(void) iemExecMemAllocatorReadyForUse(PVMCPUCC pVCpu, void *pv, size_t cb) RT_NOEXCEPT
    660650{
    661651#ifdef RT_OS_DARWIN
    662     /* See iemExecMemAllocatorAllocInChunkInt for the explanation. */
    663     int rc = RTMemProtect(pv, cb, RTMEM_PROT_EXEC | RTMEM_PROT_READ);
    664     AssertRC(rc); RT_NOREF(pVCpu);
    665 
    666652    /*
    667653     * Flush the instruction cache:
     
    670656    /* sys_dcache_flush(pv, cb); - not necessary */
    671657    sys_icache_invalidate(pv, cb);
     658    RT_NOREF(pVCpu);
    672659#elif defined(RT_OS_LINUX)
    673660    RT_NOREF(pVCpu);
     
    724711#endif
    725712    {
    726         uintptr_t const offChunk = (uintptr_t)pv - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunk;
     713        uintptr_t const offChunk = (uintptr_t)pv - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx;
    727714        fFound = offChunk < cbChunk;
    728715        if (fFound)
     
    738725            ASMBitClearRange(pbmAlloc, idxFirst, idxFirst + cReqUnits);
    739726
    740 #if 0 /*def IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER - not necessary, we'll validate the header in the pruning code. */
    741 # ifdef RT_OS_DARWIN
    742             int rc = RTMemProtect(pHdr, sizeof(*pHdr), RTMEM_PROT_WRITE | RTMEM_PROT_READ);
    743             AssertRC(rc); RT_NOREF(pVCpu);
    744 # endif
    745             pHdr->uMagic    = 0;
    746             pHdr->idxChunk  = 0;
    747             pHdr->pTb       = NULL;
    748 # ifdef RT_OS_DARWIN
    749             rc = RTMemProtect(pHdr, sizeof(*pHdr), RTMEM_PROT_EXEC | RTMEM_PROT_READ);
    750             AssertRC(rc); RT_NOREF(pVCpu);
    751 # endif
     727            /* Invalidate the header using the writeable memory view. */
     728            pHdr = (PIEMEXECMEMALLOCHDR)((uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRw + offChunk);
     729#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
     730            pHdr->uMagic   = 0;
     731            pHdr->idxChunk = 0;
     732            pHdr->pTb      = NULL;
    752733#endif
    753734            pExecMemAllocator->aChunks[idxChunk].cFreeUnits  += cReqUnits;
     
    14041385    AssertLogRelReturn(pvChunk, VERR_NO_EXEC_MEMORY);
    14051386
     1387#ifdef RT_OS_DARWIN
     1388    /*
     1389     * Because it is impossible to have a RWX memory allocation on macOS try to remap the memory
     1390     * chunk readable/executable somewhere else so we can save us the hassle of switching between
     1391     * protections when executable memory is allocated.
     1392     */
     1393    mach_port_t       hPortTask    = mach_task_self();
     1394    mach_vm_address_t AddrChunk    = (mach_vm_address_t)pvChunk;
     1395    mach_vm_address_t AddrRemapped = 0;
     1396    vm_prot_t ProtCur, ProtMax;
     1397    kern_return_t krc = mach_vm_remap(hPortTask, &AddrRemapped, pExecMemAllocator->cbChunk, 0,
     1398                                      VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR,
     1399                                      hPortTask, AddrChunk, FALSE, &ProtCur, &ProtMax,
     1400                                      VM_INHERIT_NONE);
     1401    if (krc != KERN_SUCCESS)
     1402    {
     1403        RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
     1404        AssertLogRelFailed();
     1405        return VERR_NO_EXEC_MEMORY;
     1406    }
     1407
     1408    krc = mach_vm_protect(mach_task_self(), AddrRemapped, pExecMemAllocator->cbChunk, FALSE, VM_PROT_READ | VM_PROT_EXECUTE);
     1409    if (krc != KERN_SUCCESS)
     1410    {
     1411        krc = mach_vm_deallocate(hPortTask, AddrRemapped, pExecMemAllocator->cbChunk);
     1412        Assert(krc == KERN_SUCCESS);
     1413        RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
     1414        AssertLogRelFailed();
     1415        return VERR_NO_EXEC_MEMORY;
     1416    }
     1417
     1418    void *pvChunkRx = (void *)AddrRemapped;
     1419#else
     1420    void *pvChunkRx = pvChunk;
     1421#endif
     1422
    14061423    /*
    14071424     * Add the chunk.
     
    14101427     * memory from the chunk when using the alternative sub-allocator.
    14111428     */
    1412     pExecMemAllocator->aChunks[idxChunk].pvChunk      = pvChunk;
     1429    pExecMemAllocator->aChunks[idxChunk].pvChunkRw    = pvChunk;
     1430    pExecMemAllocator->aChunks[idxChunk].pvChunkRx    = pvChunkRx;
    14131431#ifdef IN_RING3
    14141432    pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = NULL;
     
    14301448     * (This sets pvUnwindInfo.)
    14311449     */
    1432     int rc = iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(pVCpu, pExecMemAllocator, pvChunk, idxChunk);
     1450    int rc = iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(pVCpu, pExecMemAllocator, pvChunkRx, idxChunk);
    14331451    if (RT_SUCCESS(rc))
    14341452    { /* likely */ }
     
    14411459        memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
    14421460               0xff, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
    1443         pExecMemAllocator->aChunks[idxChunk].pvChunk    = NULL;
     1461        pExecMemAllocator->aChunks[idxChunk].pvChunkRw  = NULL;
    14441462        pExecMemAllocator->aChunks[idxChunk].cFreeUnits = 0;
     1463
     1464#ifdef RT_OS_DARWIN
     1465        krc = mach_vm_deallocate(mach_task_self(), (mach_vm_address_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx,
     1466                                 pExecMemAllocator->cbChunk);
     1467        Assert(krc == KERN_SUCCESS);
     1468#endif
    14451469
    14461470        RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
     
    15401564        pExecMemAllocator->aChunks[i].cFreeUnits   = 0;
    15411565        pExecMemAllocator->aChunks[i].idxFreeHint  = 0;
    1542         pExecMemAllocator->aChunks[i].pvChunk      = NULL;
     1566        pExecMemAllocator->aChunks[i].pvChunkRw    = NULL;
    15431567#ifdef IN_RING0
    15441568        pExecMemAllocator->aChunks[i].hMemObj      = NIL_RTR0MEMOBJ;
  • trunk/src/VBox/VMM/VMMAll/IEMAllN8veRecompiler.cpp

    r104798 r104858  
    97079707        iemTbAllocatorProcessDelayedFrees(pVCpu, pVCpu->iem.s.pTbAllocatorR3);
    97089708
    9709     PIEMNATIVEINSTR const paFinalInstrBuf = (PIEMNATIVEINSTR)iemExecMemAllocatorAlloc(pVCpu, off * sizeof(IEMNATIVEINSTR), pTb);
     9709    PIEMNATIVEINSTR paFinalInstrBufRx = NULL;
     9710    PIEMNATIVEINSTR const paFinalInstrBuf = (PIEMNATIVEINSTR)iemExecMemAllocatorAlloc(pVCpu, off * sizeof(IEMNATIVEINSTR), pTb, (void **)&paFinalInstrBufRx);
    97109711    AssertReturn(paFinalInstrBuf, pTb);
    97119712    memcpy(paFinalInstrBuf, pReNative->pInstrBuf, off * sizeof(paFinalInstrBuf[0]));
     
    97689769    }
    97699770
    9770     iemExecMemAllocatorReadyForUse(pVCpu, paFinalInstrBuf, off * sizeof(IEMNATIVEINSTR));
     9771    iemExecMemAllocatorReadyForUse(pVCpu, paFinalInstrBufRx, off * sizeof(IEMNATIVEINSTR));
    97719772    STAM_REL_PROFILE_ADD_PERIOD(&pVCpu->iem.s.StatTbNativeCode, off * sizeof(IEMNATIVEINSTR));
    97729773
     
    97759776     */
    97769777    RTMemFree(pTb->Thrd.paCalls);
    9777     pTb->Native.paInstructions  = paFinalInstrBuf;
     9778    pTb->Native.paInstructions  = paFinalInstrBufRx;
    97789779    pTb->Native.cInstructions   = off;
    97799780    pTb->fFlags                 = (pTb->fFlags & ~IEMTB_F_TYPE_MASK) | IEMTB_F_TYPE_NATIVE;
  • trunk/src/VBox/VMM/include/IEMInternal.h

    r104722 r104858  
    62536253DECLHIDDEN(void)    iemNativeDisassembleTb(PCIEMTB pTb, PCDBGFINFOHLP pHlp) RT_NOEXCEPT;
    62546254int                 iemExecMemAllocatorInit(PVMCPU pVCpu, uint64_t cbMax, uint64_t cbInitial, uint32_t cbChunk) RT_NOEXCEPT;
    6255 DECLHIDDEN(void *)  iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq, PIEMTB pTb) RT_NOEXCEPT;
     6255DECLHIDDEN(void *)  iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq, PIEMTB pTb, void **ppvExec) RT_NOEXCEPT;
    62566256DECLHIDDEN(void)    iemExecMemAllocatorReadyForUse(PVMCPUCC pVCpu, void *pv, size_t cb) RT_NOEXCEPT;
    62576257void                iemExecMemAllocatorFree(PVMCPU pVCpu, void *pv, size_t cb) RT_NOEXCEPT;
Note: See TracChangeset for help on using the changeset viewer.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette