VirtualBox

Changeset 106545 in vbox for trunk/include


Timestamp: Oct 21, 2024 7:52:03 AM
Author: vboxsync
svn:sync-xref-src-repo-rev: 165379
Message:

include/iprt/asm.h: Make it build on win.arm64, bugref:10392
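
In short: MSVC provides no inline assembler on arm64, so the header's RT_INLINE_ASM_USES_INTRIN code paths are extended with arm64 compiler intrinsics, and the previously x86-only #pragma intrinsic list is made architecture-conditional. A minimal standalone sketch of the same idea, not code from the changeset; the helper name is hypothetical and _InterlockedExchange/__atomic_exchange_n merely stand in for the more specialised intrinsics the header actually uses:

    #include <stdint.h>
    #if defined(_MSC_VER)
    # include <intrin.h>    /* MSVC has no inline assembler on arm64; intrinsics only. */
    #endif

    /* Hypothetical helper mirroring the shape of the patched exchange paths. */
    static inline uint32_t MyAtomicXchgU32(volatile uint32_t *pu32, uint32_t u32New)
    {
    #if defined(_MSC_VER)
        /* Compiler-intrinsic path (the kind of code RT_INLINE_ASM_USES_INTRIN selects). */
        return (uint32_t)_InterlockedExchange((volatile long *)pu32, (long)u32New);
    #else
        /* GCC/Clang path: the builtin expands to the appropriate barriers/instructions. */
        return __atomic_exchange_n(pu32, u32New, __ATOMIC_SEQ_CST);
    #endif
    }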

File: 1 edited

  • trunk/include/iprt/asm.h

    r106061 r106545  
    5757# include <iprt/sanitized/intrin.h>
    5858# pragma intrinsic(_ReadWriteBarrier)
    59 # pragma intrinsic(__cpuid)
    60 # pragma intrinsic(__stosd)
    61 # pragma intrinsic(__stosw)
    62 # pragma intrinsic(__stosb)
     59# if defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
     60#  pragma intrinsic(__cpuid)
     61#  pragma intrinsic(__stosd)
     62#  pragma intrinsic(__stosw)
     63#  pragma intrinsic(__stosb)
     64#  ifdef RT_ARCH_AMD64
     65#   pragma intrinsic(__stosq)
     66#   pragma intrinsic(_byteswap_uint64)
     67#   pragma intrinsic(_InterlockedCompareExchange128)
     68#   pragma intrinsic(_InterlockedExchange64)
     69#   pragma intrinsic(_InterlockedExchangeAdd64)
     70#   pragma intrinsic(_InterlockedAnd64)
     71#   pragma intrinsic(_InterlockedOr64)
     72#   pragma intrinsic(_InterlockedIncrement64)
     73#   pragma intrinsic(_InterlockedDecrement64)
     74#  endif
     75# elif defined(RT_ARCH_ARM64)
     76#   pragma intrinsic(__break)
     77#   pragma intrinsic(__dmb)
     78#   pragma intrinsic(__dsb)
     79#   pragma intrinsic(__isb)
     80#   pragma intrinsic(__nop)
     81#   pragma intrinsic(__yield)
     82#   pragma intrinsic(__swp8)
     83#   pragma intrinsic(__swpa8)
     84#   pragma intrinsic(__swpal8)
     85#   pragma intrinsic(__swp16)
     86#   pragma intrinsic(__swpa16)
     87#   pragma intrinsic(__swpal16)
     88#   pragma intrinsic(__swp32)
     89#   pragma intrinsic(__swpa32)
     90#   pragma intrinsic(__swpal32)
     91#   pragma intrinsic(__swp64)
     92#   pragma intrinsic(__swpa64)
     93#   pragma intrinsic(__swpal64)
     94#   pragma intrinsic(__cas8)
     95#   pragma intrinsic(__casl8)
     96#   pragma intrinsic(__cas16)
     97#   pragma intrinsic(__casl16)
     98#   pragma intrinsic(__cas32)
     99#   pragma intrinsic(__casl32)
     100#   pragma intrinsic(__cas64)
     101#   pragma intrinsic(__casl64)
     102#   pragma intrinsic(__casa8)
     103#   pragma intrinsic(__casal8)
     104#   pragma intrinsic(__casa16)
     105#   pragma intrinsic(__casa64)
     106#   pragma intrinsic(__iso_volatile_load8)
     107#   pragma intrinsic(__iso_volatile_load16)
     108#   pragma intrinsic(__iso_volatile_load32)
     109#   pragma intrinsic(__iso_volatile_load64)
     110#   pragma intrinsic(__iso_volatile_store8)
     111#   pragma intrinsic(__iso_volatile_store16)
     112#   pragma intrinsic(__iso_volatile_store32)
     113#   pragma intrinsic(__iso_volatile_store64)
     114#   pragma intrinsic(__load_acquire8)
     115#   pragma intrinsic(__load_acquire16)
     116#   pragma intrinsic(__load_acquire32)
     117#   pragma intrinsic(__load_acquire64)
     118#   pragma intrinsic(__stlr8)
     119#   pragma intrinsic(__stlr16)
     120#   pragma intrinsic(__stlr32)
     121#   pragma intrinsic(__stlr64)
     122# else
     123#  error "Port me"
     124# endif
    63125# pragma intrinsic(_BitScanForward)
    64126# pragma intrinsic(_BitScanReverse)
     
    86148# pragma intrinsic(_rotl64)
    87149# pragma intrinsic(_rotr64)
    88 # ifdef RT_ARCH_AMD64
    89 #  pragma intrinsic(__stosq)
    90 #  pragma intrinsic(_byteswap_uint64)
    91 #  pragma intrinsic(_InterlockedCompareExchange128)
    92 #  pragma intrinsic(_InterlockedExchange64)
    93 #  pragma intrinsic(_InterlockedExchangeAdd64)
    94 #  pragma intrinsic(_InterlockedAnd64)
    95 #  pragma intrinsic(_InterlockedOr64)
    96 #  pragma intrinsic(_InterlockedIncrement64)
    97 #  pragma intrinsic(_InterlockedDecrement64)
    98 # endif
    99 #endif
    100 
    101 #if (defined(RT_ARCH_ARM64) && defined(RT_OS_DARWIN)) || defined(DOXYGEN_RUNNING)
     150#endif
     151
     152#if (defined(RT_ARCH_ARM64) && (defined(RT_OS_DARWIN) || defined(RT_OS_WINDOWS))) || defined(DOXYGEN_RUNNING)
    102153/** @def RTASM_ARM64_USE_FEAT_LSE
    103154 * Use instructions from the FEAT_LSE set to implement atomic operations,
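
For context: FEAT_LSE (ARMv8.1-A) adds single-instruction atomics such as SWP and CAS, while the baseline ARMv8.0 code falls back to LDAXR/STLXR exclusive-monitor retry loops. A compile-only sketch for GCC/Clang on AArch64 contrasting the two 32-bit exchange sequences this header switches between; the function names are illustrative, the instruction patterns mirror those in the diff below, and building the LSE variant assumes an ARMv8.1-A target (e.g. -march=armv8.1-a):

    #include <stdint.h>

    #if defined(__aarch64__) && defined(__GNUC__)
    /* With FEAT_LSE: a single SWPAL performs the exchange with acquire/release ordering. */
    static inline uint32_t XchgU32_Lse(volatile uint32_t *pMem, uint32_t uNew)
    {
        uint32_t uOld;
        __asm__ __volatile__("swpal  %w[uNew], %w[uOld], %[pMem]\n\t"
                             : [pMem] "+Q" (*pMem)
                             , [uOld] "=&r" (uOld)
                             : [uNew] "r" (uNew));
        return uOld;
    }

    /* Without FEAT_LSE: exclusive-monitor retry loop (LDAXR/STLXR). */
    static inline uint32_t XchgU32_LlSc(volatile uint32_t *pMem, uint32_t uNew)
    {
        uint32_t uOld;
        uint32_t rc;
        __asm__ __volatile__("1:\n\t"
                             "ldaxr  %w[uOld], %[pMem]\n\t"
                             "stlxr  %w[rc], %w[uNew], %[pMem]\n\t"
                             "cbnz   %w[rc], 1b\n\t"
                             : [pMem] "+Q" (*pMem)
                             , [uOld] "=&r" (uOld)
                             , [rc]   "=&r" (rc)
                             : [uNew] "r" (uNew));
        return uOld;
    }
    #endif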
     
    475526
    476527# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
     528
     529#  if RT_INLINE_ASM_USES_INTRIN
     530    __yield();
     531#  else
    477532    __asm__ __volatile__("yield\n\t"); /* ARMv6K+ */
     533#  endif
    478534
    479535# else
     
    522578
    523579# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
     580
     581#  if RT_INLINE_ASM_USES_INTRIN
     582    uint8_t uOld;
     583#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     584    uOld = __swpal8(pu8, u8);
     585#   else
     586    uOld = __swp8(pu8, u8);
     587    __dmb(_ARM64_BARRIER_SY);
     588#   endif
     589    return uOld;
     590
     591#  else
    524592    uint32_t uOld;
    525 #  if defined(RTASM_ARM64_USE_FEAT_LSE)
     593#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    526594    /* SWPALB is ~40% more expensive than the non-LSE variant (M1), but since we
    527595       have the barrier we shouldn't need that, right? Ordering should be taken
    528596       care of by the DMB. The SWPB is rather cheap (~70% faster). */
    529597    __asm__ __volatile__("Lstart_ASMAtomicXchgU8_%=:\n\t"
    530 #   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     598#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    531599                         "swpalb    %w[uNew], %w[uOld], %[pMem]\n\t"
    532 #   else
     600#    else
    533601                         RTASM_ARM_DMB_SY
    534602                         "swpb      %w[uNew], %w[uOld], %[pMem]\n\t"
    535 #   endif
     603#    endif
    536604                         : [pMem] "+Q" (*pu8)
    537605                         , [uOld] "=&r" (uOld)
     
    542610    __asm__ __volatile__("Ltry_again_ASMAtomicXchgU8_%=:\n\t"
    543611                         RTASM_ARM_DMB_SY
    544 #   if defined(RT_ARCH_ARM64)
     612#    if defined(RT_ARCH_ARM64)
    545613                         "ldaxrb    %w[uOld], %[pMem]\n\t"
    546614                         "stlxrb    %w[rc], %w[uNew], %[pMem]\n\t"
    547615                         "cbnz      %w[rc], Ltry_again_ASMAtomicXchgU8_%=\n\t"
    548 #   else
     616#    else
    549617                         "ldrexb    %[uOld], %[pMem]\n\t"      /* ARMv6+ */
    550618                         "strexb    %[rc], %[uNew], %[pMem]\n\t"
    551619                         "cmp       %[rc], #0\n\t"
    552620                         "bne       Ltry_again_ASMAtomicXchgU8_%=\n\t"
    553 #   endif
     621#    endif
    554622                         : [pMem] "+Q" (*pu8)
    555623                         , [uOld] "=&r" (uOld)
     
    558626                           RTASM_ARM_DMB_SY_COMMA_IN_REG
    559627                         : "cc");
    560 #  endif
     628#   endif
    561629    return (uint8_t)uOld;
     630#  endif
    562631
    563632# else
     
    636705
    637706# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
     707
     708#  if RT_INLINE_ASM_USES_INTRIN
     709    uint16_t uOld;
     710#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     711    uOld = __swpal16(pu16, u16);
     712#   else
     713    uOld = __swp16(pu16, u16);
     714    __dmb(_ARM64_BARRIER_SY);
     715#   endif
     716    return uOld;
     717
     718#  else
    638719    uint32_t uOld;
    639 #  if defined(RTASM_ARM64_USE_FEAT_LSE)
     720#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    640721    /* SWPALH is ~40% more expensive than the non-LSE variant on an M1, 20%
    641722       slower if we remove the barrier.  But since we have the barrier we
     
    643724       The SWPH is rather cheap (~70% faster). */
    644725    __asm__ __volatile__("Lstart_ASMAtomicXchgU16_%=:\n\t"
    645 #   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     726#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    646727                         "swpalh    %w[uNew], %w[uOld], %[pMem]\n\t"
    647 #   else
     728#    else
    648729                         RTASM_ARM_DMB_SY
    649730                         "swph      %w[uNew], %w[uOld], %[pMem]\n\t"
    650 #   endif
     731#    endif
    651732                         : [pMem] "+Q" (*pu16)
    652733                         , [uOld] "=&r" (uOld)
    653734                         : [uNew] "r" ((uint32_t)u16)
    654735                         : );
    655 #  else
     736#   else
    656737    uint32_t rcSpill;
    657738    __asm__ __volatile__("Ltry_again_ASMAtomicXchgU16_%=:\n\t"
    658739                         RTASM_ARM_DMB_SY
    659 #   if defined(RT_ARCH_ARM64)
     740#    if defined(RT_ARCH_ARM64)
    660741                         "ldaxrh    %w[uOld], %[pMem]\n\t"
    661742                         "stlxrh    %w[rc], %w[uNew], %[pMem]\n\t"
    662743                         "cbnz      %w[rc], Ltry_again_ASMAtomicXchgU16_%=\n\t"
    663 #   else
     744#    else
    664745                         "ldrexh    %[uOld], %[pMem]\n\t"      /* ARMv6+ */
    665746                         "strexh    %[rc], %[uNew], %[pMem]\n\t"
    666747                         "cmp       %[rc], #0\n\t"
    667748                         "bne       Ltry_again_ASMAtomicXchgU16_%=\n\t"
    668 #   endif
     749#    endif
    669750                         : [pMem] "+Q" (*pu16)
    670751                         , [uOld] "=&r" (uOld)
     
    673754                           RTASM_ARM_DMB_SY_COMMA_IN_REG
    674755                         : "cc");
    675 #  endif
     756#   endif
    676757    return (uint16_t)uOld;
     758#endif
    677759
    678760# else
     
    740822
    741823# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
     824
     825#  if RT_INLINE_ASM_USES_INTRIN
    742826    uint32_t uOld;
    743 #  if defined(RTASM_ARM64_USE_FEAT_LSE)
     827#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     828    uOld = __swpal32(pu32, u32);
     829#   else
     830    uOld = __swp32(pu32, u32);
     831    __dmb(_ARM64_BARRIER_SY);
     832#   endif
     833    return uOld;
     834
     835#  else
     836    uint32_t uOld;
     837#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    744838    /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
    745839       slower if we remove the barrier.  But since we have the barrier we
     
    747841       The SWP is rather cheap (~70% faster). */
    748842    __asm__ __volatile__("Lstart_ASMAtomicXchgU32_%=:\n\t"
    749 #   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     843#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    750844                         "swpal     %w[uNew], %w[uOld], %[pMem]\n\t"
    751 #   else
     845#    else
    752846                         RTASM_ARM_DMB_SY
    753847                         "swp       %w[uNew], %w[uOld], %[pMem]\n\t"
    754 #   endif
     848#    endif
    755849                         : [pMem] "+Q" (*pu32)
    756850                         , [uOld] "=&r" (uOld)
    757851                         : [uNew] "r" (u32)
    758852                         : );
    759 #  else
     853#   else
    760854    uint32_t rcSpill;
    761855    __asm__ __volatile__("Ltry_again_ASMAtomicXchgU32_%=:\n\t"
    762856                         RTASM_ARM_DMB_SY
    763 #   if defined(RT_ARCH_ARM64)
     857#    if defined(RT_ARCH_ARM64)
    764858                         "ldaxr     %w[uOld], %[pMem]\n\t"
    765859                         "stlxr     %w[rc], %w[uNew], %[pMem]\n\t"
    766860                         "cbnz      %w[rc], Ltry_again_ASMAtomicXchgU32_%=\n\t"
    767 #   else
     861#    else
    768862                         "ldrex     %[uOld], %[pMem]\n\t"      /* ARMv6+ */
    769863                         "strex     %[rc], %[uNew], %[pMem]\n\t"
    770864                         "cmp       %[rc], #0\n\t"
    771865                         "bne       Ltry_again_ASMAtomicXchgU32_%=\n\t"
    772 #   endif
     866#    endif
    773867                         : [pMem] "+Q"  (*pu32)
    774868                         , [uOld] "=&r" (uOld)
     
    777871                           RTASM_ARM_DMB_SY_COMMA_IN_REG
    778872                         : "cc");
    779 #  endif
     873#   endif
    780874    return uOld;
     875#  endif
    781876
    782877# else
     
    884979
    885980# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
     981
     982#  if RT_INLINE_ASM_USES_INTRIN
    886983    uint64_t uOld;
    887 #  if defined(RTASM_ARM64_USE_FEAT_LSE)
     984#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     985    uOld = __swpal64(pu64, u64);
     986#   else
     987    uOld = __swp64(pu64, u64);
     988#   endif
     989    return uOld;
     990
     991#  else
     992    uint64_t uOld;
     993#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    888994    /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
    889995       slower if we remove the barrier.  But since we have the barrier we
     
    891997       The SWP is rather cheap (~70% faster). */
    892998    __asm__ __volatile__("Lstart_ASMAtomicXchgU64_%=:\n\t"
    893 #   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     999#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    8941000                         "swpal     %[uNew], %[uOld], %[pMem]\n\t"
    895 #   else
     1001#    else
    8961002                         RTASM_ARM_DMB_SY
    8971003                         "swp       %[uNew], %[uOld], %[pMem]\n\t"
    898 #   endif
     1004#    endif
    8991005                         : [pMem] "+Q" (*pu64)
    9001006                         , [uOld] "=&r" (uOld)
    9011007                         : [uNew] "r" (u64)
    9021008                         : );
    903 #  else
     1009#   else
    9041010    uint32_t rcSpill;
    9051011    __asm__ __volatile__("Ltry_again_ASMAtomicXchgU64_%=:\n\t"
    9061012                         RTASM_ARM_DMB_SY
    907 #   if defined(RT_ARCH_ARM64)
     1013#    if defined(RT_ARCH_ARM64)
    9081014                         "ldaxr     %[uOld], %[pMem]\n\t"
    9091015                         "stlxr     %w[rc], %[uNew], %[pMem]\n\t"
    9101016                         "cbnz      %w[rc], Ltry_again_ASMAtomicXchgU64_%=\n\t"
    911 #   else
     1017#    else
    9121018                         "ldrexd    %[uOld], %H[uOld], %[pMem]\n\t"      /* ARMv6+ */
    9131019                         "strexd    %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
    9141020                         "cmp       %[rc], #0\n\t"
    9151021                         "bne       Ltry_again_ASMAtomicXchgU64_%=\n\t"
    916 #   endif
     1022#    endif
    9171023                         : [pMem] "+Q"  (*pu64)
    9181024                         , [uOld] "=&r" (uOld)
     
    9211027                           RTASM_ARM_DMB_SY_COMMA_IN_REG
    9221028                         : "cc");
    923 #  endif
     1029#   endif
    9241030    return uOld;
     1031#  endif
    9251032
    9261033# else
     
    11401247 * @todo Rename ASMAtomicCmpWriteU8
    11411248 */
    1142 #if RT_INLINE_ASM_EXTERNAL_TMP_ARM || !RT_INLINE_ASM_GNU_STYLE
     1249#if RT_INLINE_ASM_EXTERNAL_TMP_ARM || (!RT_INLINE_ASM_GNU_STYLE && !defined(RT_ARCH_ARM64))
    11431250RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old) RT_NOTHROW_PROTO;
    11441251#else
     
    11571264                         : "cc");
    11581265    return (bool)u8Ret;
     1266
     1267#  elif RT_INLINE_ASM_USES_INTRIN
     1268    return (uint8_t)_InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old) == u8Old;
    11591269
    11601270# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     
    13181428
    13191429# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     1430
     1431#  ifdef RT_INLINE_ASM_USES_INTRIN
     1432    uint32_t uOldActual;
     1433#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     1434    uOldActual = __casal32(pu32, u32Old, u32New);
     1435#   else
     1436    uOldActual = __casal32(pu32, u32Old, u32New);
     1437    __dmb(_ARM64_BARRIER_SY);
     1438#   endif
     1439    return uOldActual == u32Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
     1440
     1441#  else
    13201442    union { uint32_t u; bool f; } fXchg;
    13211443    uint32_t u32Spill;
    13221444    /* M1 bench:   match: casal= 6592 vs dmb+cas= 1562 vs non-lse=5634 (ps/call)
    13231445                mismatch: casal=18794 vs dmb+cas=19697 vs non-lse=2499 (ps/call) */
    1324 #  if defined(RTASM_ARM64_USE_FEAT_LSE)
     1446#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    13251447    __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU32_%=:\n\t"
    1326 #   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     1448#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    13271449                         "casal     %w[uOldActual], %w[uNew], %[pMem]\n\t"
    1328 #   else
     1450#    else
    13291451                         RTASM_ARM_DMB_SY
    13301452                         "cas       %w[uOldActual], %w[uNew], %[pMem]\n\t"
    1331 #   endif
     1453#    endif
    13321454                         "cmp       %w[uOldActual], %w[uOldOrg]\n\t"
    13331455                         "cset      %w[fXchg], eq\n\t"
     
    13391461                         , "[uOldActual]"     (u32Old)
    13401462                         : "cc");
    1341 #  else
     1463#   else
    13421464    uint32_t rcSpill;
    13431465    __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU32_%=:\n\t"
    13441466                         RTASM_ARM_DMB_SY
    1345 #   if defined(RT_ARCH_ARM64)
     1467#    if defined(RT_ARCH_ARM64)
    13461468                         "ldaxr     %w[uOld], %[pMem]\n\t"
    13471469                         "cmp       %w[uOld], %w[uCmp]\n\t"
     
    13521474                         "1:\n\t"
    13531475                         "clrex\n\t"
    1354 #   else
     1476#    else
    13551477                         "ldrex     %[uOld], %[pMem]\n\t"
    13561478                         "teq       %[uOld], %[uCmp]\n\t"
     
    13621484                         "1:\n\t"
    13631485                         /** @todo clrexne on armv7? */
    1364 #   endif
     1486#    endif
    13651487                         : [pMem]   "+Q"  (*pu32)
    13661488                         , [uOld]   "=&r" (u32Spill)
     
    13721494                           RTASM_ARM_DMB_SY_COMMA_IN_REG
    13731495                         : "cc");
     1496#    endif
     1497    return fXchg.f;
    13741498#   endif
    1375     return fXchg.f;
    13761499
    13771500# else
     
    15041627
    15051628# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     1629
     1630#  ifdef RT_INLINE_ASM_USES_INTRIN
     1631    uint64_t uOldActual;
     1632#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     1633    uOldActual = __casal64(pu64, u64Old, u64New);
     1634#   else
     1635    uOldActual = __casal64(pu64, u64Old, u64New);
     1636    __dmb(_ARM64_BARRIER_SY);
     1637#   endif
     1638    return uOldActual == u64Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
     1639
     1640#  else
    15061641    union { uint32_t u; bool f; } fXchg;
    15071642    uint64_t u64Spill;
    15081643    /* M1 bench:   match: casal= 6599 vs dmb+cas= 1565 vs non-lse=5000 (ps/call)
    15091644                mismatch: casal=18797 vs dmb+cas=19731 vs non-lse=2512 (ps/call) */
    1510 #  if defined(RTASM_ARM64_USE_FEAT_LSE)
     1645#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    15111646    __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU75_%=:\n\t"
    1512 #   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     1647#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    15131648                         "casal     %[uOldActual], %[uNew], %[pMem]\n\t"
    1514 #   else
     1649#    else
    15151650                         RTASM_ARM_DMB_SY
    15161651                         "cas       %[uOldActual], %[uNew], %[pMem]\n\t"
    1517 #   endif
     1652#    endif
    15181653                         "cmp       %[uOldActual], %[uOldOrg]\n\t"
    15191654                         "cset      %w[fXchg], eq\n\t"
     
    15251660                         , "[uOldActual]"     (u64Old)
    15261661                         : "cc");
    1527 #  else
     1662#   else
    15281663    uint32_t rcSpill;
    15291664    __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
    15301665                         RTASM_ARM_DMB_SY
    1531 #   if defined(RT_ARCH_ARM64)
     1666#    if defined(RT_ARCH_ARM64)
    15321667                         "ldaxr     %[uOld], %[pMem]\n\t"
    15331668                         "cmp       %[uOld], %[uCmp]\n\t"
     
    15381673                         "1:\n\t"
    15391674                         "clrex\n\t"
    1540 #   else
     1675#    else
    15411676                         "ldrexd    %[uOld], %H[uOld], %[pMem]\n\t"
    15421677                         "teq       %[uOld], %[uCmp]\n\t"
     
    15491684                         "1:\n\t"
    15501685                         /** @todo clrexne on armv7? */
    1551 #   endif
     1686#    endif
    15521687                         : [pMem]   "+Q"  (*pu64)
    15531688                         , [uOld]   "=&r" (u64Spill)
     
    15591694                           RTASM_ARM_DMB_SY_COMMA_IN_REG
    15601695                         : "cc");
    1561 #  endif
     1696#   endif
    15621697    return fXchg.f;
     1698#  endif
    15631699
    15641700# else
     
    18752011    /* M1 bench:   match: casalb= 6594 vs dmb+casb= 1561 vs non-lse=5051 (ps/call)
    18762012                mismatch: casalb=15346 vs dmb+casb=16349 vs non-lse=2505 (ps/call) */
    1877 #  if defined(RTASM_ARM64_USE_FEAT_LSE)
     2013#  ifdef RT_INLINE_ASM_USES_INTRIN
     2014#   if defined(RTASM_ARM64_USE_FEAT_LSE)
     2015    uint8_t uOldActual;
     2016#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     2017    uOldActual = __casal8(pu8, u8Old, u8New);
     2018#    else
     2019    uOldActual = __casal8(pu8, u8Old, u8New);
     2020    __dmb(_ARM64_BARRIER_SY);
     2021#    endif
     2022     return (*pu8Old = uOldActual) == u8Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
     2023#   else
     2024    return (*pu8Old = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old)) == u8Old;
     2025#   endif
     2026
     2027#  else
     2028
     2029#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    18782030    union { uint32_t u; bool f; } fXchg;
    18792031    uint32_t u32Actual;
    18802032    __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU8_%=:\n\t"
    1881 #   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     2033#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    18822034                         "casalb    %w[uOldActual], %w[uNew], %[pMem]\n\t"
    1883 #   else
     2035#    else
    18842036                         RTASM_ARM_DMB_SY
    18852037                         "casb      %w[uOldActual], %w[uNew], %[pMem]\n\t"
    1886 #   endif
     2038#    endif
    18872039                         "cmp       %w[uOldActual], %w[uOldOrg]\n\t"
    18882040                         "cset      %w[fXchg], eq\n\t"
     
    18952047                         : "cc");
    18962048    *pu8Old = (uint8_t)u32Actual;
    1897 #  else
     2049#   else
    18982050    union { uint8_t u; bool f; } fXchg;
    18992051    uint8_t u8ActualOld;
     
    19012053    __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU8_%=:\n\t"
    19022054                         RTASM_ARM_DMB_SY
    1903 #   if defined(RT_ARCH_ARM64)
     2055#    if defined(RT_ARCH_ARM64)
    19042056                         "ldaxrb    %w[uOld], %[pMem]\n\t"
    19052057                         "cmp       %w[uOld], %w[uCmp]\n\t"
     
    19102062                         "1:\n\t"
    19112063                         "clrex\n\t"
    1912 #   else
     2064#    else
    19132065                         "ldrexb     %[uOld], %[pMem]\n\t"
    19142066                         "teq       %[uOld], %[uCmp]\n\t"
     
    19202072                         "1:\n\t"
    19212073                         /** @todo clrexne on armv7? */
    1922 #   endif
     2074#    endif
    19232075                         : [pMem]   "+Q"  (*pu8)
    19242076                         , [uOld]   "=&r" (u8ActualOld)
     
    19312083                         : "cc");
    19322084    *pu8Old = u8ActualOld;
    1933 # endif
     2085#   endif
    19342086    return fXchg.f;
     2087#  endif
    19352088
    19362089# else
     
    20272180    /* M1 bench:   match: casalh= 6577 vs dmb+cash= 1608 vs non-lse=5078 (ps/call)
    20282181                mismatch: casalh=18791 vs dmb+cash=19721 vs non-lse=2543 (ps/call) */
    2029 #  if defined(RTASM_ARM64_USE_FEAT_LSE)
     2182#  ifdef RT_INLINE_ASM_USES_INTRIN
     2183#   if defined(RTASM_ARM64_USE_FEAT_LSE)
     2184    uint16_t uOldActual;
     2185#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     2186    uOldActual = __casal16(pu16, u16Old, u16New);
     2187#    else
     2188    uOldActual = __casal16(pu16, u16Old, u16New);
     2189    __dmb(_ARM64_BARRIER_SY);
     2190#    endif
     2191     return (*pu16Old = uOldActual) == u16Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
     2192#   else
     2193    return (*pu16Old = _InterlockedCompareExchange16((char RT_FAR *)pu16, u16New, u16Old)) == u16Old;
     2194#   endif
     2195
     2196#  else
     2197
     2198#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    20302199    union { uint32_t u; bool f; } fXchg;
    20312200    uint32_t u32Actual;
    20322201    __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU16_%=:\n\t"
    2033 #   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     2202#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    20342203                         "casalh    %w[uOldActual], %w[uNew], %[pMem]\n\t"
    2035 #   else
     2204#    else
    20362205                         RTASM_ARM_DMB_SY
    20372206                         "cash      %w[uOldActual], %w[uNew], %[pMem]\n\t"
    2038 #   endif
     2207#    endif
    20392208                         "cmp       %w[uOldActual], %w[uOldOrg]\n\t"
    20402209                         "cset      %w[fXchg], eq\n\t"
     
    20472216                         : "cc");
    20482217    *pu16Old = (uint16_t)u32Actual;
    2049 #  else
     2218#   else
    20502219    union { uint16_t u; bool f; } fXchg;
    20512220    uint16_t u16ActualOld;
     
    20532222    __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU16_%=:\n\t"
    20542223                         RTASM_ARM_DMB_SY
    2055 #   if defined(RT_ARCH_ARM64)
     2224#    if defined(RT_ARCH_ARM64)
    20562225                         "ldaxrh    %w[uOld], %[pMem]\n\t"
    20572226                         "cmp       %w[uOld], %w[uCmp]\n\t"
     
    20622231                         "1:\n\t"
    20632232                         "clrex\n\t"
    2064 #   else
     2233#    else
    20652234                         "ldrexh     %[uOld], %[pMem]\n\t"
    20662235                         "teq       %[uOld], %[uCmp]\n\t"
     
    20722241                         "1:\n\t"
    20732242                         /** @todo clrexne on armv7? */
    2074 #   endif
     2243#    endif
    20752244                         : [pMem]   "+Q"  (*pu16)
    20762245                         , [uOld]   "=&r" (u16ActualOld)
     
    20832252                         : "cc");
    20842253    *pu16Old = u16ActualOld;
    2085 #  endif
     2254#   endif
    20862255    return fXchg.f;
     2256#  endif
    20872257
    20882258# else
     
    21772347
    21782348# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     2349
     2350#  ifdef RT_INLINE_ASM_USES_INTRIN
     2351#   if defined(RTASM_ARM64_USE_FEAT_LSE)
     2352    uint32_t uOldActual;
     2353#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     2354    uOldActual = __casal32(pu32, u32Old, u32New);
     2355#    else
     2356    uOldActual = __casal32(pu32, u32Old, u32New);
     2357    __dmb(_ARM64_BARRIER_SY);
     2358#    endif
     2359     return (*pu32Old = uOldActual) == u32Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
     2360#   else
     2361    return (*pu32Old = _InterlockedCompareExchange((char RT_FAR *)pu32, u32New, u32Old)) == u32Old;
     2362#   endif
     2363
     2364#  else
     2365
    21792366    union { uint32_t u; bool f; } fXchg;
    21802367    /* M1 bench:   match: casal= 6590 vs dmb+cas= 1564 vs non-lse=5033 (ps/call)
    21812368                mismatch: casal=18790 vs dmb+cas=19711 vs non-lse=2503 (ps/call) */
    2182 #  if defined(RTASM_ARM64_USE_FEAT_LSE)
     2369#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    21832370    __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
    2184 #   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
     2371#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    21852372                         "casal     %w[uOldActual], %w[uNew], %[pMem]\n\t"
    2186 #   else
     2373#    else
    21872374                         RTASM_ARM_DMB_SY
    21882375                         "cas       %w[uOldActual], %w[uNew], %[pMem]\n\t"
    2189 #   endif
     2376#    endif
    21902377                         "cmp       %w[uOldActual], %w[uOldOrg]\n\t"
    21912378                         "cset      %w[fXchg], eq\n\t"
     
    21972384                         , "[uOldActual]"     (u32Old)
    21982385                         : "cc");
    2199 #  else
     2386#   else
    22002387    uint32_t u32ActualOld;
    22012388    uint32_t rcSpill;
    22022389    __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU32_%=:\n\t"
    22032390                         RTASM_ARM_DMB_SY
    2204 #   if defined(RT_ARCH_ARM64)
     2391#    if defined(RT_ARCH_ARM64)
    22052392                         "ldaxr     %w[uOld], %[pMem]\n\t"
    22062393                         "cmp       %w[uOld], %w[uCmp]\n\t"
     
    22112398                         "1:\n\t"
    22122399                         "clrex\n\t"
    2213 #   else
     2400#    else
    22142401                         "ldrex     %[uOld], %[pMem]\n\t"
    22152402                         "teq       %[uOld], %[uCmp]\n\t"
     
    22212408                         "1:\n\t"
    22222409                         /** @todo clrexne on armv7? */
    2223 #   endif
     2410#    endif
    22242411                         : [pMem]   "+Q"  (*pu32)
    22252412                         , [uOld]   "=&r" (u32ActualOld)
     
    22322419                         : "cc");
    22332420    *pu32Old = u32ActualOld;
    2234 #  endif
     2421#   endif
    22352422    return fXchg.f;
     2423#  endif
    22362424
    22372425# else
     
    28283016DECLINLINE(void) ASMSerializeInstruction(void) RT_NOTHROW_DEF
    28293017{
     3018# ifdef RT_INLINE_ASM_USES_INTRIN
     3019    __dsb(_ARM64_BARRIER_SY);
     3020# else
    28303021    __asm__ __volatile__ (RTASM_ARM_DSB_SY :: RTASM_ARM_DSB_SY_IN_REG :);
     3022# endif
    28313023}
    28323024#else
     
    28553047# endif
    28563048#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3049# ifdef RT_INLINE_ASM_USES_INTRIN
     3050    __dmb(_ARM64_BARRIER_SY);
     3051# else
    28573052    __asm__ __volatile__ (RTASM_ARM_DMB_SY :: RTASM_ARM_DMB_SY_IN_REG :);
     3053# endif
    28583054#elif ARCH_BITS == 16
    28593055    uint16_t volatile u16;
     
    28863082# endif
    28873083#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3084# ifdef RT_INLINE_ASM_USES_INTRIN
     3085    __dmb(_ARM64_BARRIER_ST);
     3086# else
    28883087    __asm__ __volatile__ (RTASM_ARM_DMB_ST :: RTASM_ARM_DMB_ST_IN_REG :);
     3088# endif
    28893089#else
    28903090    ASMMemoryFence();
     
    29133113# endif
    29143114#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3115# ifdef RT_INLINE_ASM_USES_INTRIN
     3116    __dmb(_ARM64_BARRIER_LD);
     3117# else
    29153118    __asm__ __volatile__ (RTASM_ARM_DMB_LD :: RTASM_ARM_DMB_LD_IN_REG :);
     3119# endif
    29163120#else
    29173121    ASMMemoryFence();
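
The fence hunks above route ASMSerializeInstruction, ASMMemoryFence, ASMWriteFence and ASMReadFence through MSVC's __dsb/__dmb intrinsics with the full, store-only and load-only barrier arguments. A tiny illustrative sketch of that mapping, with hypothetical wrapper names:

    #if defined(_MSC_VER) && defined(_M_ARM64)
    # include <intrin.h>

    /* Illustrative wrappers (names hypothetical) for the barrier mapping used above. */
    static inline void MyMemoryFence(void)     { __dmb(_ARM64_BARRIER_SY); } /* dmb sy */
    static inline void MyWriteFence(void)      { __dmb(_ARM64_BARRIER_ST); } /* dmb st */
    static inline void MyReadFence(void)       { __dmb(_ARM64_BARRIER_LD); } /* dmb ld */
    static inline void MySerializeInstr(void)  { __dsb(_ARM64_BARRIER_SY); } /* dsb sy */
    #endif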
     
    29293133{
    29303134#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3135
     3136# ifdef RT_INLINE_ASM_USES_INTRIN
     3137    return (uint8_t)__load_acquire8(pu8);
     3138
     3139# else
    29313140    uint32_t u32;
    2932 # if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1 */
     3141#  if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1 */
    29333142    __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
    29343143                         RTASM_ARM_DMB_SY
     
    29383147                           "0" (0)
    29393148                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
    2940 # else
     3149#  else
    29413150    __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
    29423151                         RTASM_ARM_DMB_SY
    2943 #  if defined(RT_ARCH_ARM64)
    2944 #   if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */
     3152#   if defined(RT_ARCH_ARM64)
     3153#    if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */
    29453154                         "ldurb     %w[uDst], %[pMem]\n\t"
    2946 #   else
     3155#    else
    29473156                         "ldxrb     %w[uDst], %[pMem]\n\t"
    29483157                         "clrex\n\t"
    2949 #   endif
    2950 #  else
     3158#    endif
     3159#   else
    29513160                         "ldrexb    %[uDst], %[pMem]\n\t"
    29523161                         /** @todo clrex   */
    2953 #  endif
     3162#   endif
    29543163                         : [uDst] "=&r" (u32)
    29553164                         : [pMem] "Q" (*pu8)
    29563165                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
    2957 # endif
     3166#  endif
    29583167    return (uint8_t)u32;
     3168# endif
     3169
    29593170#else
    29603171    ASMMemoryFence();
     
    29733184{
    29743185#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3186
     3187# ifdef RT_INLINE_ASM_USES_INTRIN
     3188    return (uint8_t)__iso_volatile_load8((volatile char *)pu8);
     3189
     3190# else
     3191
    29753192    uint32_t u32;
    29763193    __asm__ __volatile__("Lstart_ASMAtomicUoReadU8_%=:\n\t"
    2977 # if defined(RT_ARCH_ARM64)
     3194#  if defined(RT_ARCH_ARM64)
    29783195                         "ldurb    %w[uDst], %[pMem]\n\t"
    2979 # else
     3196#  else
    29803197                         "ldrexb    %[uDst], %[pMem]\n\t" /** @todo fix this */
    2981 # endif
     3198#  endif
    29823199                         : [uDst] "=&r" (u32)
    29833200                         : [pMem] "Q" (*pu8));
    29843201    return (uint8_t)u32;
     3202# endif
    29853203#else
    29863204    return *pu8;    /* byte reads are atomic on x86 */
     
    30153233{
    30163234#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3235
     3236# ifdef RT_INLINE_ASM_USES_INTRIN
     3237    return __iso_volatile_load8((volatile char *)pi8);
     3238
     3239# else
     3240
    30173241    int32_t i32;
    30183242    __asm__ __volatile__("Lstart_ASMAtomicUoReadS8_%=:\n\t"
    3019 # if defined(RT_ARCH_ARM64)
     3243#  if defined(RT_ARCH_ARM64)
    30203244                         "ldurb     %w[iDst], %[pMem]\n\t"
    3021 # else
     3245#  else
    30223246                         "ldrexb    %[iDst], %[pMem]\n\t" /** @todo fix this */
    3023 # endif
     3247#  endif
    30243248                         : [iDst] "=&r" (i32)
    30253249                         : [pMem] "Q" (*pi8));
    30263250    return (int8_t)i32;
     3251# endif
    30273252#else
    30283253    return *pi8;    /* byte reads are atomic on x86 */
     
    30413266    Assert(!((uintptr_t)pu16 & 1));
    30423267#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3268
     3269# ifdef RT_INLINE_ASM_USES_INTRIN
     3270    return (uint16_t)__load_acquire16(pu16);
     3271
     3272# else
     3273
    30433274    uint32_t u32;
    3044 # if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
     3275#  if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
    30453276    __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
    30463277                         RTASM_ARM_DMB_SY
     
    30503281                           "0" (0)
    30513282                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
    3052 # else
     3283#  else
    30533284    __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
    30543285                         RTASM_ARM_DMB_SY
    3055 #  if defined(RT_ARCH_ARM64)
    3056 #   if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
     3286#   if defined(RT_ARCH_ARM64)
     3287#    if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
    30573288                         "ldurh     %w[uDst], %[pMem]\n\t"
    3058 #   else
     3289#    else
    30593290                         "ldxrh     %w[uDst], %[pMem]\n\t"
    30603291                         "clrex\n\t"
    3061 #   endif
    3062 #  else
     3292#    endif
     3293#   else
    30633294                         "ldrexh    %[uDst], %[pMem]\n\t"
    30643295                         /** @todo clrex    */
    3065 #  endif
     3296#   endif
    30663297                         : [uDst] "=&r" (u32)
    30673298                         : [pMem] "Q" (*pu16)
    30683299                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
    3069 # endif
     3300#  endif
    30703301    return (uint16_t)u32;
     3302# endif
     3303
    30713304#else
    30723305    ASMMemoryFence();
     
    30863319    Assert(!((uintptr_t)pu16 & 1));
    30873320#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3321
     3322# ifdef RT_INLINE_ASM_USES_INTRIN
     3323    return (uint16_t)__iso_volatile_load16((volatile int16_t *)pu16);
     3324
     3325# else
     3326
    30883327    uint32_t u32;
    30893328    __asm__ __volatile__("Lstart_ASMAtomicUoReadU16_%=:\n\t"
    3090 # if defined(RT_ARCH_ARM64)
     3329#  if defined(RT_ARCH_ARM64)
    30913330                         "ldurh     %w[uDst], %[pMem]\n\t"
    3092 # else
     3331#  else
    30933332                         "ldrexh    %[uDst], %[pMem]\n\t" /** @todo fix this */
    3094 # endif
     3333#  endif
    30953334                         : [uDst] "=&r" (u32)
    30963335                         : [pMem] "Q" (*pu16));
    30973336    return (uint16_t)u32;
     3337# endif
     3338
    30983339#else
    30993340    return *pu16;
     
    31303371    Assert(!((uintptr_t)pi16 & 1));
    31313372#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3373
     3374# ifdef RT_INLINE_ASM_USES_INTRIN
     3375    return __iso_volatile_load16(pi16);
     3376
     3377# else
     3378
    31323379    int32_t i32;
    31333380    __asm__ __volatile__("Lstart_ASMAtomicUoReadS16_%=:\n\t"
    3134 # if defined(RT_ARCH_ARM64)
     3381#  if defined(RT_ARCH_ARM64)
    31353382                         "ldurh     %w[iDst], %[pMem]\n\t"
    3136 # else
     3383#  else
    31373384                         "ldrexh    %[iDst], %[pMem]\n\t" /** @todo fix this */
    3138 # endif
     3385#  endif
    31393386                         : [iDst] "=&r" (i32)
    31403387                         : [pMem] "Q" (*pi16));
    31413388    return (int16_t)i32;
     3389# endif
     3390
    31423391#else
    31433392    return *pi16;
     
    31563405    Assert(!((uintptr_t)pu32 & 3));
    31573406#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3407
     3408# ifdef RT_INLINE_ASM_USES_INTRIN
     3409    return (uint32_t)__load_acquire32(pu32);
     3410
     3411# else
     3412
    31583413    uint32_t u32;
    3159 # if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
     3414#  if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
    31603415    __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
    31613416                         RTASM_ARM_DMB_SY
     
    31653420                           "0" (0)
    31663421                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
    3167 # else
     3422#  else
    31683423    __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
    31693424                         RTASM_ARM_DMB_SY
    3170 #  if defined(RT_ARCH_ARM64)
    3171 #   if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
     3425#   if defined(RT_ARCH_ARM64)
     3426#    if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
    31723427                         "ldur      %w[uDst], %[pMem]\n\t"
    3173 #   else
     3428#    else
    31743429                         "ldxr      %w[uDst], %[pMem]\n\t"
    31753430                         "clrex\n\t"
    3176 #   endif
    3177 #  else
     3431#    endif
     3432#   else
    31783433                         "ldrex    %[uDst], %[pMem]\n\t"
    31793434                         /** @todo clrex    */
    3180 #  endif
     3435#   endif
    31813436                         : [uDst] "=&r" (u32)
    31823437                         : [pMem] "Q" (*pu32)
    31833438                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
    3184 # endif
     3439#  endif
    31853440    return u32;
     3441# endif
     3442
    31863443#else
    31873444    ASMMemoryFence();
     
    32043461    Assert(!((uintptr_t)pu32 & 3));
    32053462#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3463
     3464# ifdef RT_INLINE_ASM_USES_INTRIN
     3465    return (uint32_t)__iso_volatile_load32((volatile int32_t *)pu32);
     3466
     3467# else
     3468
    32063469    uint32_t u32;
    32073470    __asm__ __volatile__("Lstart_ASMAtomicUoReadU32_%=:\n\t"
    3208 # if defined(RT_ARCH_ARM64)
     3471#  if defined(RT_ARCH_ARM64)
    32093472                         "ldur      %w[uDst], %[pMem]\n\t"
    3210 # else
     3473#  else
    32113474                         "ldrex     %[uDst], %[pMem]\n\t" /** @todo fix this */
    3212 # endif
     3475#  endif
    32133476                         : [uDst] "=&r" (u32)
    32143477                         : [pMem] "Q" (*pu32));
    32153478    return u32;
     3479# endif
     3480
    32163481#else
    32173482# if ARCH_BITS == 16
     
    32543519    Assert(!((uintptr_t)pi32 & 3));
    32553520#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     3521
     3522# ifdef RT_INLINE_ASM_USES_INTRIN
     3523    return __iso_volatile_load32(pi32);
     3524
     3525# else
     3526
    32563527    int32_t i32;
    32573528    __asm__ __volatile__("Lstart_ASMAtomicUoReadS32_%=:\n\t"
    3258 # if defined(RT_ARCH_ARM64)
     3529#  if defined(RT_ARCH_ARM64)
    32593530                         "ldur      %w[iDst], %[pMem]\n\t"
    3260 # else
     3531#  else
    32613532                         "ldrex     %[iDst], %[pMem]\n\t" /** @todo thix this */
    3262 # endif
     3533#  endif
    32633534                         : [iDst] "=&r" (i32)
    32643535                         : [pMem] "Q" (*pi32));
    32653536    return i32;
     3537# endif
    32663538
    32673539#else
     
    33563628    Assert(!((uintptr_t)pu64 & 7));
    33573629
    3358 # if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
     3630# ifdef RT_INLINE_ASM_USES_INTRIN
     3631    u64 = (uint64_t)__load_acquire64(pu64);
     3632
     3633# else
     3634
     3635#  if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
    33593636    __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
    33603637                         RTASM_ARM_DMB_SY
     
    33643641                           "0" (0)
    33653642                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
    3366 # else
     3643#  else
    33673644    __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
    33683645                         RTASM_ARM_DMB_SY
    3369 #   if defined(RT_ARCH_ARM64)
    3370 #    if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
     3646#    if defined(RT_ARCH_ARM64)
     3647#     if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
    33713648                         "ldur      %[uDst], %[pMem]\n\t"
    3372 #    else
     3649#     else
    33733650                         "ldxr      %[uDst], %[pMem]\n\t"
    33743651                         "clrex\n\t"
    3375 #    endif
    3376 #   else
     3652#     endif
     3653#    else
    33773654                         "ldrexd    %[uDst], %H[uDst], %[pMem]\n\t"
    33783655                         /** @todo clrex    */
    3379 #   endif
     3656#    endif
    33803657                         : [uDst] "=&r" (u64)
    33813658                         : [pMem] "Q" (*pu64)
    33823659                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
     3660#   endif
    33833661#  endif
    33843662# else
     
    34733751# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    34743752    Assert(!((uintptr_t)pu64 & 7));
     3753
     3754
     3755#  ifdef RT_INLINE_ASM_USES_INTRIN
     3756    u64 = (uint64_t)__iso_volatile_load64((volatile int64_t *)pu64);
     3757
     3758#  else
     3759
    34753760    __asm__ __volatile__("Lstart_ASMAtomicUoReadU64_%=:\n\t"
    3476 # if defined(RT_ARCH_ARM64)
     3761#   if defined(RT_ARCH_ARM64)
    34773762                         "ldur      %[uDst], %[pMem]\n\t"
    3478 # else
     3763#   else
    34793764                         "ldrexd    %[uDst], %H[uDst], %[pMem]\n\t" /* this is required for atomic access since it's a pair */
    34803765                         /** @todo clrex? */
    3481 # endif
     3766#   endif
    34823767                         : [uDst] "=&r" (u64)
    34833768                         : [pMem] "Q" (*pu64));
     3769# endif
    34843770
    34853771# else
     
    39254211{
    39264212#if defined(RT_ARCH_ARM64)
     4213
     4214# ifdef RT_INLINE_ASM_USES_INTRIN
     4215    __dmb(_ARM64_BARRIER_SY);
     4216    __stlr8(pu8, u8);
     4217    __dmb(_ARM64_BARRIER_SY);
     4218# else
     4219
    39274220    /* The DMB SY will ensure ordering a la x86, the stlrb is probably overkill
    39284221       as all byte accesses are single-copy atomic, which I think suffices here. */
    39294222    __asm__ __volatile__("Lstart_ASMAtomicWriteU8_%=:\n\t"
    3930 # if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* this is a lot slower and has no alignment benefits with LSE2 */
     4223#  if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* this is a lot slower and has no alignment benefits with LSE2 */
    39314224                         RTASM_ARM_DMB_SY
    39324225                         "swpb      %w[uValue], wzr, %[pMem]\n\t"
    3933 # else
     4226#  else
    39344227                         RTASM_ARM_DMB_SY
    39354228                         "stlrb     %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
    3936 # endif
     4229#  endif
    39374230                         : [pMem]   "+Q" (*pu8)
    39384231                         : [uValue] "r" ((uint32_t)u8)
    39394232                         : );
     4233# endif
     4234
    39404235#else
    39414236    ASMAtomicXchgU8(pu8, u8);
     
    39934288{
    39944289#if defined(RT_ARCH_ARM64)
     4290
     4291# ifdef RT_INLINE_ASM_USES_INTRIN
     4292    __dmb(_ARM64_BARRIER_SY);
     4293    __stlr16(pu16, u16);
     4294    __dmb(_ARM64_BARRIER_SY);
     4295# else
     4296
    39954297    __asm__ __volatile__("Lstart_ASMAtomicWriteU16_%=:\n\t"
    3996 # if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
     4298#  if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
    39974299                         RTASM_ARM_DMB_SY
    39984300                         "swph      %w[uValue], wzr, %[pMem]\n\t"
    3999 # else
     4301#  else
    40004302                         RTASM_ARM_DMB_SY
    40014303                         "stlrh     %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
    4002 # endif
     4304#  endif
    40034305                         : [pMem]   "+Q" (*pu16)
    40044306                         : [uValue] "r" ((uint32_t)u16)
    40054307                         : );
     4308# endif
     4309
    40064310#else
    40074311    ASMAtomicXchgU16(pu16, u16);
     
    40614365{
    40624366#if defined(RT_ARCH_ARM64)
     4367
     4368
     4369# ifdef RT_INLINE_ASM_USES_INTRIN
     4370    __dmb(_ARM64_BARRIER_SY);
     4371    __stlr32(pu32, u32);
     4372    __dmb(_ARM64_BARRIER_SY);
     4373# else
     4374
    40634375    __asm__ __volatile__("Lstart_ASMAtomicWriteU32_%=:\n\t"
    4064 # if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
     4376#  if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
    40654377                         RTASM_ARM_DMB_SY
    40664378                         "swp      %w[uValue], wzr, %[pMem]\n\t"
    4067 # else
     4379#  else
    40684380                         RTASM_ARM_DMB_SY
    40694381                         "stlr     %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
    4070 # endif
     4382#  endif
    40714383                         : [pMem]   "+Q" (*pu32)
    40724384                         : [uValue] "r" (u32)
    40734385                         : "cc");
     4386# endif
     4387
    40744388#else
    40754389    ASMAtomicXchgU32(pu32, u32);
     
    41374451{
    41384452#if defined(RT_ARCH_ARM64)
     4453
     4454# ifdef RT_INLINE_ASM_USES_INTRIN
     4455    __dmb(_ARM64_BARRIER_SY);
     4456    __stlr64(pu64, u64);
     4457    __dmb(_ARM64_BARRIER_SY);
     4458# else
     4459
    41394460    __asm__ __volatile__("Lstart_ASMAtomicWriteU64_%=:\n\t"
    4140 # if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
     4461#  if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
    41414462                         RTASM_ARM_DMB_SY
    41424463                         "swp      %[uValue], xzr, %[pMem]\n\t"
    4143 # else
     4464#  else
    41444465                         RTASM_ARM_DMB_SY /** @todo necessary? */
    41454466                         "stlr     %[uValue], %[pMem]\n\t"
    4146 # endif
     4467#  endif
    41474468                         : [pMem]   "+Q" (*pu64)
    41484469                         : [uValue] "r" (u64)
    41494470                         : );
     4471# endif
     4472
    41504473#else
    41514474    ASMAtomicXchgU64(pu64, u64);
     
    47955118DECLINLINE(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
    47965119{
    4797 # if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
     5120# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
    47985121    u64 = _InterlockedExchangeAdd64((__int64 RT_FAR *)pu64, u64);
    47995122    return u64;
     
    51565479DECLINLINE(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
    51575480{
    5158 # if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
     5481# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
    51595482    return (uint64_t)_InterlockedIncrement64((__int64 RT_FAR *)pu64);
    51605483
     
    53535676DECLINLINE(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
    53545677{
    5355 # if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
     5678# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
    53565679    return (uint64_t)_InterlockedDecrement64((__int64 volatile RT_FAR *)pu64);
    53575680
     
    55155838DECLINLINE(uint32_t) ASMAtomicOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
    55165839{
    5517 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     5840#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo This should work on amd64 as well I think... */
     5841    return (uint32_t)_InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);
     5842
     5843#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    55185844#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    55195845    uint32_t u32OldRet;
     
    55755901DECLINLINE(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
    55765902{
    5577 # if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
     5903# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
    55785904    _InterlockedOr64((__int64 volatile RT_FAR *)pu64, (__int64)u64);
    55795905
     
    57186044DECLINLINE(uint32_t) ASMAtomicAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
    57196045{
    5720 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     6046#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo This should work on amd64 as well I think... */
     6047    return (uint32_t)_InterlockedAnd((long volatile RT_FAR *)pu32, (long)u32);
     6048
     6049#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    57216050# if defined(RTASM_ARM64_USE_FEAT_LSE)
    57226051    uint32_t u32OldRet;
     
    57786107DECLINLINE(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
    57796108{
    5780 # if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
     6109# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
    57816110    _InterlockedAnd64((__int64 volatile RT_FAR *)pu64, u64);
    57826111
     
    59226251DECLINLINE(uint32_t) ASMAtomicXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
    59236252{
    5924 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     6253# if RT_INLINE_ASM_USES_INTRIN
     6254    return (uint32_t)_InterlockedXor((long volatile RT_FAR *)pu32, u32);
     6255
     6256#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    59256257# if defined(RTASM_ARM64_USE_FEAT_LSE)
    59266258    uint32_t u32OldRet;
     
    59826314DECLINLINE(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
    59836315{
    5984 # if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
     6316# if RT_INLINE_ASM_USES_INTRIN /** @todo This is too much... */
     6317    _InterlockedOr((long volatile RT_FAR *)pu32, u32);
     6318
     6319# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    59856320#  if RT_INLINE_ASM_GNU_STYLE
    59866321    __asm__ __volatile__("orl %1, %0\n\t"
     
    60356370DECLINLINE(uint32_t) ASMAtomicUoOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
    60366371{
    6037 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     6372#if RT_INLINE_ASM_USES_INTRIN /** @todo Check what the compiler generates... */
     6373    return (uint32_t)_InterlockedOr_nf((long volatile RT_FAR *)pu32, u32);
     6374
     6375#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    60386376# if defined(RTASM_ARM64_USE_FEAT_LSE)
    60396377    uint32_t u32OldRet;
     
    60856423DECLINLINE(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
    60866424{
    6087 # if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
     6425# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
     6426    _InterlockedOr64_nf((volatile int64_t *)pu64, (int64_t)u64);
     6427
     6428# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    60886429    __asm__ __volatile__("orq %1, %q0\n\t"
    60896430                         : "=m" (*pu64)
     
    61496490DECLINLINE(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
    61506491{
    6151 # if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
     6492# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
     6493    _InterlockedAnd_nf((volatile long *)pu32, (long)u32);
     6494
     6495# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    61526496#  if RT_INLINE_ASM_GNU_STYLE
    61536497    __asm__ __volatile__("andl %1, %0\n\t"
     
    62026546DECLINLINE(uint32_t) ASMAtomicUoAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
    62036547{
    6204 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     6548#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
     6549    return (uint32_t)_InterlockedAnd_nf((volatile long *)pu32, (long)u32);
     6550
     6551#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    62056552# if defined(RTASM_ARM64_USE_FEAT_LSE)
    62066553    uint32_t u32OldRet;
     
    62526599DECLINLINE(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
    62536600{
    6254 # if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
     6601# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
     6602    _InterlockedAnd64_nf((volatile int64_t *)pu64, (int64_t)u64);
     6603
     6604# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    62556605    __asm__ __volatile__("andq %1, %0\n\t"
    62566606                         : "=m" (*pu64)
     
    63166666DECLINLINE(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
    63176667{
    6318 # if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
     6668# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
     6669    _InterlockedXor_nf((volatile long *)pu32, (long)u32);
     6670
     6671# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    63196672#  if RT_INLINE_ASM_GNU_STYLE
    63206673    __asm__ __volatile__("xorl %1, %0\n\t"
     
    63686721DECLINLINE(uint32_t) ASMAtomicUoXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
    63696722{
    6370 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     6723#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
     6724    return (uint32_t)_InterlockedXor_nf((volatile long *)pu32, (long)u32);
     6725
     6726#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    63716727# if defined(RTASM_ARM64_USE_FEAT_LSE)
    63726728    uint32_t u32OldRet;
     
    64186774DECLINLINE(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
    64196775{
    6420 # if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
     6776# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
     6777    return _InterlockedIncrement_nf((volatile long *)pu32);
     6778
     6779# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    64216780    uint32_t u32;
    64226781#  if RT_INLINE_ASM_GNU_STYLE
     
    64846843DECLINLINE(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
    64856844{
    6486 # if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
     6845# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
     6846    return _InterlockedDecrement_nf((volatile long *)pu32);
     6847
     6848# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    64876849    uint32_t u32;
    64886850#  if RT_INLINE_ASM_GNU_STYLE
     
    66417003DECLINLINE(uint64_t) ASMByteSwapU64(uint64_t u64) RT_NOTHROW_DEF
    66427004{
    6643 #if defined(RT_ARCH_AMD64) && RT_INLINE_ASM_USES_INTRIN
     7005#if (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)) && RT_INLINE_ASM_USES_INTRIN
    66447006    return _byteswap_uint64(u64);
    66457007
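
To close, a minimal usage sketch of the affected atomics API, assuming the IPRT headers are on the include path; it exercises the exchange and compare-exchange helpers whose arm64 intrinsic paths this changeset adds:

    #include <iprt/asm.h>   /* assumes the IPRT include directory is available */
    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
        volatile uint32_t u32 = 0;

        uint32_t uOld = ASMAtomicXchgU32(&u32, 42);        /* returns the previous value (0)   */
        bool     fOk  = ASMAtomicCmpXchgU32(&u32, 99, 42); /* stores 99 only if *pu32 was 42   */

        printf("old=%u ok=%d val=%u\n", (unsigned)uOld, (int)fOk, (unsigned)u32);
        return 0;
    }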