VirtualBox

Changeset 104147 in vbox for trunk


Timestamp: Apr 4, 2024 1:21:36 AM (10 months ago)
Author: vboxsync
Message:

VMM/IEM: Optimize (?) the TLB code on ARM64 by using LDP and (for code) STP. Currently disabled. Also a disabled native recompiler profiling tweak. bugref:10374
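
The gist of the optimization: where the emitted TLB-lookup code previously fetched two adjacent 64-bit IEMTLB fields with two separate LDR instructions, it can now use a single LDP (and, for the code TLB, a single STP for stores). A minimal sketch of the intent, with illustrative register numbers rather than the dynamically allocated ones the emitter really uses:

    /* Before: two loads, two memory accesses:
     *     ldr x3, [x4, #offsetof(IEMTLB, uTlbRevision)]
     *     ldr x5, [x4, #offsetof(IEMTLB, uTlbPhysRev)]
     * After: one load-pair, one memory access (requires the two fields to be
     * adjacent; the code below also keeps them 16-byte aligned):
     *     ldp x3, x5, [x4, #offsetof(IEMTLB, uTlbRevision)]
     */
    pCodeBuf[off++] = Armv8A64MkInstrLdPairGpr(idxReg3, idxReg5, idxReg4,
                                               RT_UOFFSETOF(IEMTLB, uTlbRevision) / 8);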

Location: trunk
Files: 4 edited

  • trunk/include/iprt/armv8.h

    r104056 → r104147

    @@ -2368,4 +2368,23 @@
     }
     
    +
    +/** A64: ldp x1, x2, [x3]   */
    +DECL_FORCE_INLINE(uint32_t) Armv8A64MkInstrLdPairGpr(uint32_t iReg1, uint32_t iReg2, uint32_t iBaseReg, int32_t iImm7 = 0,
    +                                                     ARM64INSTRSTLDPAIRTYPE enmType = kArm64InstrStLdPairType_Signed,
    +                                                     bool f64Bit = true)
    +{
    +    return Armv8A64MkInstrStLdPair(true /*fLoad*/, f64Bit ? 2 : 0, enmType, iReg1, iReg2, iBaseReg, iImm7);
    +}
    +
    +
    +/** A64: stp x1, x2, [x3]   */
    +DECL_FORCE_INLINE(uint32_t) Armv8A64MkInstrStPairGpr(uint32_t iReg1, uint32_t iReg2, uint32_t iBaseReg, int32_t iImm7 = 0,
    +                                                     ARM64INSTRSTLDPAIRTYPE enmType = kArm64InstrStLdPairType_Signed,
    +                                                     bool f64Bit = true)
    +{
    +    return Armv8A64MkInstrStLdPair(false /*fLoad*/, f64Bit ? 2 : 0, enmType, iReg1, iReg2, iBaseReg, iImm7);
    +}
    +
    +
     typedef enum                         /* Size VR Opc */
     {                                    /*     \ | /   */
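
For reference, a hypothetical self-check of the two new wrappers, assuming Armv8A64MkInstrStLdPair emits the standard A64 "signed offset" STP/LDP encoding (expected words derived from the ARMv8 encoding tables, not from this changeset; note that iImm7 is scaled, i.e. given in units of 8 bytes when f64Bit is true):

    Assert(Armv8A64MkInstrLdPairGpr(1, 2, 3)              == UINT32_C(0xa9400861)); /* ldp x1, x2, [x3] */
    Assert(Armv8A64MkInstrStPairGpr(1, 2, 3)              == UINT32_C(0xa9000861)); /* stp x1, x2, [x3] */
    Assert(Armv8A64MkInstrLdPairGpr(1, 2, 3, 2 /*iImm7*/) == UINT32_C(0xa9410861)); /* ldp x1, x2, [x3, #16] */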
  • trunk/src/VBox/VMM/VMMAll/IEMAllN8veRecompiler.cpp

    r104144 → r104147

    @@ -8191,12 +8191,12 @@
             ENTRY(iem.s.StatNativeCodeTlbHitsForNewPageWithOffset),
     #endif
    -        ENTRY(iem.s.DataTlb.aEntries),
             ENTRY(iem.s.DataTlb.uTlbRevision),
             ENTRY(iem.s.DataTlb.uTlbPhysRev),
             ENTRY(iem.s.DataTlb.cTlbHits),
    -        ENTRY(iem.s.CodeTlb.aEntries),
    +        ENTRY(iem.s.DataTlb.aEntries),
             ENTRY(iem.s.CodeTlb.uTlbRevision),
             ENTRY(iem.s.CodeTlb.uTlbPhysRev),
             ENTRY(iem.s.CodeTlb.cTlbHits),
    +        ENTRY(iem.s.CodeTlb.aEntries),
             ENTRY(pVMR3),
             ENTRY(cpum.GstCtx.rax),

    @@ -8960,4 +8960,7 @@
     DECLHIDDEN(PIEMTB) iemNativeRecompile(PVMCPUCC pVCpu, PIEMTB pTb) RT_NOEXCEPT
     {
    +#if 0 /* For profiling the native recompiler code. */
    +l_profile_again:
    +#endif
         STAM_REL_PROFILE_START(&pVCpu->iem.s.StatNativeRecompilation, a);
     

    @@ -9249,4 +9252,12 @@
     #endif
     
    +#if 0 /* For profiling the native recompiler code. */
    +    if (pTb->Thrd.cCalls >= 136)
    +    {
    +        STAM_REL_PROFILE_STOP(&pVCpu->iem.s.StatNativeRecompilation, a);
    +        goto l_profile_again;
    +    }
    +#endif
    +
         /*
          * Allocate executable memory, copy over the code we've generated.
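
Flipping both #if 0 blocks above to #if 1 makes iemNativeRecompile() recompile any TB with at least 136 threaded calls over and over, restarting the STAM sample each round, presumably so a sampling profiler sees a steady-state recompiler rather than one-shot translations. A loop-form paraphrase of the goto in the diff:

    for (;;)
    {
        STAM_REL_PROFILE_START(&pVCpu->iem.s.StatNativeRecompilation, a);
        /* ... translate pTb's threaded calls to native code ... */
        if (pTb->Thrd.cCalls < 136)     /* small TBs aren't worth profiling */
            break;
        STAM_REL_PROFILE_STOP(&pVCpu->iem.s.StatNativeRecompilation, a);    /* discard sample and redo */
    }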
  • trunk/src/VBox/VMM/include/IEMInternal.h

    r104135 → r104147

    @@ -506,8 +506,4 @@
     typedef struct IEMTLB
     {
    -    /** The TLB entries.
    -     * We've chosen 256 because that way we can obtain the result directly from an
    -     * 8-bit register without an additional AND instruction. */
    -    IEMTLBENTRY         aEntries[256];
         /** The TLB revision.
          * This is actually only 28 bits wide (see IEMTLBENTRY::uTag) and is incremented

    @@ -556,4 +552,9 @@
         /** Alignment padding. */
         uint32_t            au32Padding[6];
    +
    +    /** The TLB entries.
    +     * We've chosen 256 because that way we can obtain the result directly from an
    +     * 8-bit register without an additional AND instruction. */
    +    IEMTLBENTRY         aEntries[256];
     } IEMTLB;
     AssertCompileSizeAlignment(IEMTLB, 64);
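
Moving aEntries from the front of IEMTLB to the back leaves the hot scalar fields (uTlbRevision, uTlbPhysRev, cTlbHits, ...) at small offsets from the start of the structure and keeps the revision pair adjacent for the new LDP path. The properties the lookup code relies on, restated as illustrative compile-time checks (they mirror the AssertCompile statements in the last file of this changeset):

    AssertCompileAdjacentMembers(IEMTLB, uTlbRevision, uTlbPhysRev);           /* one LDP fetches both */
    AssertCompileMemberAlignment(IEMTLB, uTlbRevision, 16);                    /* aligned pair access */
    AssertCompile(RTASSERT_OFFSET_OF(VMCPUCC, iem.s.DataTlb.aEntries) < _64K); /* base fits a single MOVZ */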
  • trunk/src/VBox/VMM/include/IEMN8veRecompilerTlbLookup.h

    r104145 → r104147

    @@ -80,4 +80,13 @@
     #if defined(RT_ARCH_ARM64)
         uint8_t const   idxReg3;
    +/** @def IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    + * Use LDP and STP to reduce the number of instructions accessing memory at
    + * the cost of using more registers.  This will typically reduce the number
    + * of instructions emitted as well.  */
    +//# define IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +# ifdef IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +    uint8_t const   idxReg4;
    +    uint8_t const   idxReg5;
    +# endif
     #endif
         uint64_t const  uAbsPtr;

    @@ -125,4 +134,8 @@
     #if defined(RT_ARCH_ARM64)
             ,         idxReg3(!fSkip ? iemNativeRegAllocTmp(a_pReNative, a_poff) : UINT8_MAX)
    +# ifdef IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +        ,         idxReg4(!fSkip ? iemNativeRegAllocTmp(a_pReNative, a_poff) : UINT8_MAX)
    +        ,         idxReg5(!fSkip ? iemNativeRegAllocTmp(a_pReNative, a_poff) : UINT8_MAX)
    +# endif
     #endif
             ,         uAbsPtr(      a_pReNative->Core.aVars[IEMNATIVE_VAR_IDX_UNPACK(a_idxVarGCPtrMem)].enmKind

    @@ -160,4 +173,8 @@
     #if defined(RT_ARCH_ARM64)
             ,         idxReg3(!fSkip ? iemNativeRegAllocTmp(a_pReNative, a_poff) : UINT8_MAX)
    +# ifdef IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +        ,         idxReg4(!fSkip ? iemNativeRegAllocTmp(a_pReNative, a_poff) : UINT8_MAX)
    +        ,         idxReg5(!fSkip ? iemNativeRegAllocTmp(a_pReNative, a_poff) : UINT8_MAX)
    +# endif
     #endif
             ,         uAbsPtr(UINT64_MAX)

    @@ -189,4 +206,8 @@
     #if defined(RT_ARCH_ARM64)
             ,         idxReg3(!fSkip ? iemNativeRegAllocTmp(a_pReNative, a_poff) : UINT8_MAX)
    +# ifdef IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +        ,         idxReg4(!fSkip ? iemNativeRegAllocTmp(a_pReNative, a_poff) : UINT8_MAX)
    +        ,         idxReg5(!fSkip ? iemNativeRegAllocTmp(a_pReNative, a_poff) : UINT8_MAX)
    +# endif
     #endif
             ,         uAbsPtr(UINT64_MAX)

    @@ -228,4 +249,8 @@
                 iemNativeRegFreeTmp(a_pReNative, idxRegSegAttrib);
     #if defined(RT_ARCH_ARM64)
    +# ifdef IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +        iemNativeRegFreeTmp(a_pReNative, idxReg5);
    +        iemNativeRegFreeTmp(a_pReNative, idxReg4);
    +# endif
             iemNativeRegFreeTmp(a_pReNative, idxReg3);
     #endif

    @@ -242,4 +267,8 @@
     #if defined(RT_ARCH_ARM64)
                      | RT_BIT_32(idxReg3)
    +# ifdef IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +                 | RT_BIT_32(idxReg4)
    +                 | RT_BIT_32(idxReg5)
    +# endif
     #endif
                      ;
     
    @@ -592,5 +621,20 @@
     off = iemNativeEmitGprByVCpuDisp(pCodeBuf, off, pTlbState->idxReg1, offVCpuTlb + RT_UOFFSETOF(IEMTLB, uTlbRevision));
 # else
    +#  ifdef IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +    /* Load uTlbRevision into reg3 and uTlbPhysRev into reg5.
    +       We load offVCpuTlb + aEntries into reg4 and use it for addressing here
    +       and later when calculating pTlbe (saves an instruction). */
    +    AssertCompileMemberAlignment(IEMTLB, uTlbRevision, 16); /* It is said that misaligned pair loads don't perform well. */
    +    AssertCompileAdjacentMembers(IEMTLB, uTlbRevision, uTlbPhysRev);
    +    AssertCompile(RTASSERT_OFFSET_OF(IEMTLB, uTlbPhysRev) < RTASSERT_OFFSET_OF(IEMTLB, aEntries));
    +    AssertCompile(RTASSERT_OFFSET_OF(VMCPUCC, iem.s.DataTlb.aEntries) < _64K);
    +    AssertCompile(RTASSERT_OFFSET_OF(VMCPUCC, iem.s.CodeTlb.aEntries) < _64K); /* if larger do: ADD x3, x27, x3, LSL #y */
    +    pCodeBuf[off++] = Armv8A64MkInstrMovZ(pTlbState->idxReg4, offVCpuTlb + RT_UOFFSETOF(IEMTLB, aEntries));
    +    pCodeBuf[off++] = Armv8A64MkInstrAddReg(pTlbState->idxReg4, IEMNATIVE_REG_FIXED_PVMCPU, pTlbState->idxReg4);
    +    pCodeBuf[off++] = Armv8A64MkInstrLdPairGpr(pTlbState->idxReg3, pTlbState->idxReg5, pTlbState->idxReg4,
    +                                               (RT_OFFSETOF(IEMTLB, uTlbRevision) - RT_OFFSETOF(IEMTLB, aEntries)) / 8);
    +#  else
     off = iemNativeEmitLoadGprFromVCpuU64Ex(pCodeBuf, off, pTlbState->idxReg3, offVCpuTlb + RT_UOFFSETOF(IEMTLB, uTlbRevision));
    +#  endif
     off = iemNativeEmitOrGprByGprEx(pCodeBuf, off, pTlbState->idxReg1, pTlbState->idxReg3);
 # endif
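
With the pair path enabled, the aEntries base address is materialized once in reg4 and the revision fields are reached back with a negative imm7, which LDP scales by 8, hence the division above. Roughly the sequence emitted (register numbers illustrative, with x27 standing in for IEMNATIVE_REG_FIXED_PVMCPU as in the comment above):

    /*  movz x4, #(offVCpuTlb + offsetof(IEMTLB, aEntries))    ; 16-bit immediate, hence the < _64K asserts
     *  add  x4, x27, x4                                       ; x4 = &pVCpu->iem.s.{Data,Code}Tlb.aEntries
     *  ldp  x3, x5, [x4, #-(aEntries_off - uTlbRevision_off)] ; x3 = uTlbRevision, x5 = uTlbPhysRev
     */

reg4 is kept live and reused in step 3b below to index the TLB entry, saving the separate add of the pVCpu base.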
     
    @@ -599,4 +643,6 @@
      * 3b. Calc pTlbe.
      */
    +# if !defined(RT_ARCH_ARM64) || !defined(IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR)
     uint32_t const offTlbEntries = offVCpuTlb + RT_UOFFSETOF(IEMTLB, aEntries);
    +# endif
 # if defined(RT_ARCH_AMD64)
     /* movzx reg2, byte reg1 */

    @@ -620,8 +666,13 @@
     /* reg2 = (reg1 & 0xff) << 5 */
     pCodeBuf[off++] = Armv8A64MkInstrUbfiz(pTlbState->idxReg2, pTlbState->idxReg1, 5, 8);
    +#  ifdef IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +    /* reg2 += &pVCpu->iem.s.DataTlb.aEntries / CodeTlb.aEntries */
    +    pCodeBuf[off++] = Armv8A64MkInstrAddReg(pTlbState->idxReg2, pTlbState->idxReg2, pTlbState->idxReg4);
    +#  else
     /* reg2 += offsetof(VMCPUCC, iem.s.DataTlb.aEntries) */
     off = iemNativeEmitAddGprImmEx(pCodeBuf, off, pTlbState->idxReg2, offTlbEntries, pTlbState->idxReg3 /*iGprTmp*/);
     /* reg2 += pVCpu */
     off = iemNativeEmitAddTwoGprsEx(pCodeBuf, off, pTlbState->idxReg2, IEMNATIVE_REG_FIXED_PVMCPU);
    +#  endif
 # else
 #  error "Port me"

    @@ -637,5 +688,12 @@
     off = iemNativeEmitGprByGprDisp(pCodeBuf, off, pTlbState->idxReg1, pTlbState->idxReg2, RT_UOFFSETOF(IEMTLBENTRY, uTag));
 # elif defined(RT_ARCH_ARM64)
    +#  ifdef IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +    AssertCompileMemberAlignment(IEMTLBENTRY, uTag, 16); /* It is said that misaligned pair loads don't perform well. */
    +    AssertCompile(RT_UOFFSETOF(IEMTLBENTRY, uTag) + sizeof(uint64_t) == RT_UOFFSETOF(IEMTLBENTRY, fFlagsAndPhysRev));
    +    pCodeBuf[off++] = Armv8A64MkInstrLdPairGpr(pTlbState->idxReg3, pTlbState->idxReg4,
    +                                               pTlbState->idxReg2, RT_UOFFSETOF(IEMTLBENTRY, uTag) / 8);
    +#  else
     off = iemNativeEmitLoadGprByGprU64Ex(pCodeBuf, off, pTlbState->idxReg3, pTlbState->idxReg2, RT_UOFFSETOF(IEMTLBENTRY, uTag));
    +#  endif
     off = iemNativeEmitCmpGprWithGprEx(pCodeBuf, off, pTlbState->idxReg1, pTlbState->idxReg3);
 # else
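
Here one LDP fetches uTag into reg3 and fFlagsAndPhysRev into reg4 in a single access; schematically (illustrative registers):

    /*  ldp x3, x4, [x2, #offsetof(IEMTLBENTRY, uTag)]  ; x3 = uTag, x4 = fFlagsAndPhysRev
     *  cmp x1, x3                                      ; the tag check
     * x4 stays live and is consumed by the physical-revision check below,
     * which therefore needs no load of its own. */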
     
    @@ -673,9 +731,14 @@
                                     offVCpuTlb + RT_UOFFSETOF(IEMTLB, uTlbPhysRev));
 # elif defined(RT_ARCH_ARM64)
    -    off = iemNativeEmitLoadGprByGprU64Ex(pCodeBuf, off, pTlbState->idxReg3, pTlbState->idxReg2,
    -                                         RT_UOFFSETOF(IEMTLBENTRY, fFlagsAndPhysRev));
    +#  ifdef IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR
    +    pCodeBuf[off++] = Armv8A64MkInstrAnd(pTlbState->idxReg1, pTlbState->idxReg1, pTlbState->idxReg4);
    +    off = iemNativeEmitCmpGprWithGprEx(pCodeBuf, off, pTlbState->idxReg1, pTlbState->idxReg5);
    +#  else
    +    off = iemNativeEmitLoadGprByGprU64Ex(pCodeBuf, off, pTlbState->idxReg3,
    +                                         pTlbState->idxReg2, RT_UOFFSETOF(IEMTLBENTRY, fFlagsAndPhysRev));
     pCodeBuf[off++] = Armv8A64MkInstrAnd(pTlbState->idxReg1, pTlbState->idxReg1, pTlbState->idxReg3);
     off = iemNativeEmitLoadGprFromVCpuU64Ex(pCodeBuf, off, pTlbState->idxReg3, offVCpuTlb + RT_UOFFSETOF(IEMTLB, uTlbPhysRev));
     off = iemNativeEmitCmpGprWithGprEx(pCodeBuf, off, pTlbState->idxReg1, pTlbState->idxReg3);
    +#  endif
 # else
 #  error "Port me"
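
The payoff at the physical-revision check, sketched as instruction sequences (illustrative mnemonics):

    /* without LDP:  ldr x3, [x2, #fFlagsAndPhysRev]  ; load entry flags
     *               and x1, x1, x3
     *               ldr x3, [x27, #uTlbPhysRev]      ; reload the revision
     *               cmp x1, x3
     * with LDP:     and x1, x1, x4                   ; x4 from the uTag LDP above
     *               cmp x1, x5                       ; x5 from the revision LDP in step 3
     * i.e. two loads become zero loads at this point. */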
     
    @@ -691,7 +754,20 @@
      *    IEMCPU members and we return a GCPhys address rather than a host pointer.
      */
    -    /* mov  reg1, [reg2->pbMappingR3] */
    -    off = iemNativeEmitLoadGprByGprU64Ex(pCodeBuf, off, pTlbState->idxReg1, pTlbState->idxReg2,
    -                                         RT_UOFFSETOF(IEMTLBENTRY, pbMappingR3));
    +# if defined(RT_ARCH_ARM64) && defined(IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR)
    +    if (!a_fDataTlb)
    +    {
    +        /* ldp  reg4, reg1, [reg2->GCPhys+pbMappingR3] */
    +        AssertCompileMemberAlignment(IEMTLBENTRY, GCPhys, 16);
    +        AssertCompileAdjacentMembers(IEMTLBENTRY, GCPhys, pbMappingR3);
    +        pCodeBuf[off++] = Armv8A64MkInstrLdPairGpr(pTlbState->idxReg4, pTlbState->idxReg1,
    +                                                   pTlbState->idxReg2, RT_UOFFSETOF(IEMTLBENTRY, GCPhys) / 8);
    +    }
    +    else
    +# endif
    +    {
    +        /* mov  reg1, [reg2->pbMappingR3] */
    +        off = iemNativeEmitLoadGprByGprU64Ex(pCodeBuf, off, pTlbState->idxReg1, pTlbState->idxReg2,
    +                                             RT_UOFFSETOF(IEMTLBENTRY, pbMappingR3));
    +    }
     /* if (!reg1) goto tlbmiss; */
     /** @todo eliminate the need for this test? */
     
    @@ -723,9 +799,10 @@
          * Note. We do not need to set offCurInstrStart or offInstrNextByte.
          */
    -# ifdef RT_ARCH_AMD64
    +# if !defined(RT_ARCH_ARM64) || !defined(IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR)
    +#  ifdef RT_ARCH_AMD64
         uint8_t const idxReg3 = UINT8_MAX;
    -# else
    +#  else
         uint8_t const idxReg3 = pTlbState->idxReg3;
    -# endif
    +#  endif
         /* Set pbInstrBuf first since we've got it loaded already. */
         off = iemNativeEmitStoreGprToVCpuU64Ex(pCodeBuf, off, pTlbState->idxReg1,

    @@ -739,8 +816,38 @@
                                                pTlbState->idxReg1, idxReg3);
         /* Now set GCPhysInstrBuf last as we'll be returning it in idxRegMemResult. */
    +#  if defined(RT_ARCH_ARM64) && defined(IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR)
    +        off = iemNativeEmitStoreGprToVCpuU64Ex(pCodeBuf, off, pTlbState->idxReg4,
    +                                               RT_UOFFSETOF(VMCPUCC, iem.s.GCPhysInstrBuf), idxReg3);
    +#  else
         off = iemNativeEmitLoadGprByGprU64Ex(pCodeBuf, off, pTlbState->idxReg1,
                                              pTlbState->idxReg2, RT_UOFFSETOF(IEMTLBENTRY, GCPhys));
         off = iemNativeEmitStoreGprToVCpuU64Ex(pCodeBuf, off, pTlbState->idxReg1,
                                                RT_UOFFSETOF(VMCPUCC, iem.s.GCPhysInstrBuf), idxReg3);
    +#  endif
    +# else
    +        /* ARM64: Same as above but using STP. This ASSUMES that we can trash
    +                  the 6 bytes following iem.s.cbInstrBufTotal! */
    +        AssertCompileMemberAlignment(VMCPUCC, iem.s.pbInstrBuf, 16);
    +        AssertCompileAdjacentMembers(VMCPUCC, iem.s.pbInstrBuf, iem.s.uInstrBufPc);
    +        AssertCompile(RT_UOFFSETOF(VMCPUCC, iem.s.GCPhysInstrBuf) < 512);
    +        /* idxReg1 = reg2->pbMappingR3 (see previous LDP) */
    +        /* idxReg3 = FlatPC & ~GUEST_PAGE_OFFSET_MASK. */
    +        off = iemNativeEmitGprEqGprAndImmEx(pCodeBuf, off, pTlbState->idxReg3, idxRegFlatPtr, ~(RTGCPTR)GUEST_PAGE_OFFSET_MASK);
    +        pCodeBuf[off++] = Armv8A64MkInstrStPairGpr(pTlbState->idxReg1, pTlbState->idxReg3,
    +                                                   IEMNATIVE_REG_FIXED_PVMCPU, RT_UOFFSETOF(VMCPUCC, iem.s.pbInstrBuf) / 8);
    +
    +        AssertCompileMemberAlignment(VMCPUCC, iem.s.GCPhysInstrBuf, 16);
    +        AssertCompileAdjacentMembers(VMCPUCC, iem.s.GCPhysInstrBuf, iem.s.cbInstrBufTotal);
    +        AssertCompile(RT_UOFFSETOF(VMCPUCC, iem.s.GCPhysInstrBuf) < 512);
    +#  ifndef IEM_WITH_OPAQUE_DECODER_STATE
    +        AssertCompileAdjacentMembers(VMCPUCC, iem.s.cbInstrBufTotal, iem.s.offCurInstrStart);
    +        AssertCompileAdjacentMembers(VMCPUCC, iem.s.offCurInstrStart, iem.s.fPrefixes);       /* these two will be set to ~0. */
    +#  endif
    +        /* idxReg4 = reg2->GCPhys (see previous LDP) */
    +        /* idxReg3 = GUEST_PAGE_SIZE | UINT64_C(0xffffffffffff0000) */
    +        pCodeBuf[off++] = Armv8A64MkInstrMovN(pTlbState->idxReg3, ~GUEST_PAGE_SIZE & 0xffff);
    +        pCodeBuf[off++] = Armv8A64MkInstrStPairGpr(pTlbState->idxReg4, pTlbState->idxReg3,
    +                                                   IEMNATIVE_REG_FIXED_PVMCPU, RT_UOFFSETOF(VMCPUCC, iem.s.GCPhysInstrBuf) / 8);
    +# endif
         if (!a_fNoReturn) /* (We skip this for iemNativeEmitBltLoadTlbAfterBranch.) */
         {
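
The second STP packs three decoder-state updates into one 16-byte store: the first register is GCPhysInstrBuf (reg4, loaded by the earlier LDP), while MOVN builds GUEST_PAGE_SIZE | 0xffffffffffff0000 in reg3 so the second 8 bytes set cbInstrBufTotal and deliberately overwrite the adjacent fields with ~0, the "6 bytes" the comment warns about. The effect, assuming a 4K GUEST_PAGE_SIZE (xPVMCPU is a placeholder for IEMNATIVE_REG_FIXED_PVMCPU):

    /*  movn x3, #(~GUEST_PAGE_SIZE & 0xffff)   ; x3 = 0xffffffffffff1000
     *  stp  x4, x3, [xPVMCPU, #offsetof(VMCPUCC, iem.s.GCPhysInstrBuf)]
     * writes, in a single access:
     *  iem.s.GCPhysInstrBuf   = x4                          ; bytes 0-7
     *  iem.s.cbInstrBufTotal  = GUEST_PAGE_SIZE             ; low 16 bits of x3
     *  iem.s.offCurInstrStart = -1, iem.s.fPrefixes = ~0    ; the trashed 6 bytes
     */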
     
    @@ -750,5 +857,9 @@
             else
                 off = iemNativeEmitGpr32EqGprAndImmEx(pCodeBuf, off, idxRegMemResult, idxRegFlatPtr, GUEST_PAGE_OFFSET_MASK);
    +# if defined(RT_ARCH_ARM64) && defined(IEMNATIVE_WITH_TLB_LOOKUP_LOAD_STORE_PAIR)
    +            off = iemNativeEmitAddTwoGprsEx(pCodeBuf, off, idxRegMemResult, pTlbState->idxReg4);
    +# else
             off = iemNativeEmitAddTwoGprsEx(pCodeBuf, off, idxRegMemResult, pTlbState->idxReg1);
    +# endif
         }
     }
